Example #1
def get_species_neurons_correlations():
    activations = np.load(output_path('activations.npy'))
    logits = np.load(output_path('logits.npy'))
    print_info("calculate correlation matrix between features and species")

    mean_act = np.mean(activations, axis=0)
    std_act = np.std(activations, axis=0)
    norm_act = (activations - mean_act) / std_act

    mean_log = np.mean(logits, axis=0)
    std_log = np.std(logits, axis=0)
    norm_log = (logits - mean_log) / std_log

    # report how sparse the activations are (count of exact zeros)
    size = activations.shape[0] * activations.shape[1]
    c = size - np.count_nonzero(activations)
    print(str(c) + "/" + str(size) + " (" + str(c * 100.0 / size) + "%) zero activations")

    matrix = np.zeros((activations.shape[1], logits.shape[1]), dtype=float)

    for i in progressbar.progressbar(range(activations.shape[0])):
        act = norm_act[i]
        log = norm_log[i]
        for j in range(norm_act.shape[1]):
            matrix[j] += (log * act[j]) / activations.shape[0]

    result_path = output_path('correlation_activations.npy')
    print_info("save features/species correlation matrix:", result_path)
    np.save(result_path, matrix)
    print_info("saved!")
Example #2
def export_results(dataset, predictions, size=50, header=False):
    order = np.argsort(-predictions, axis=1)
    results = []

    export_path = output_path('predictions.csv')

    # check if labels have been indexed
    index_path = output_path('index.json')

    indexed_labels = get_index(index_path)

    for i in range(order.shape[0]):
        for j in range(size):
            jth = order[i][j]
            proba = predictions[i][jth]
            if indexed_labels is None:
                class_id = jth
            else:
                if jth in indexed_labels:
                    class_id = indexed_labels[jth]
                else:
                    continue
            _id = int(dataset.ids[i])
            results.append([_id, class_id, j + 1, proba])

    df = pd.DataFrame(data=results,
                      columns=['id', 'class_id', 'rank', 'proba'])
    df.to_csv(export_path, sep=';', header=header, index=False)
    print_statistics('Predictions saved at: ' + export_path)
def save_checkpoint(model,
                    optimizer=None,
                    model_name='model',
                    validation_id=None):
    """
    save checkpoint (optimizer and model)
    :param model_name:
    :param validation_id:
    :param model:
    :param optimizer:
    :return:
    """
    path = output_path(_checkpoint_path.format(model_name),
                       validation_id=validation_id,
                       have_validation=True)

    print_debug('Saving checkpoint: ' + path)

    model = model.module if type(model) is torch.nn.DataParallel else model

    checkpoint = {'model_state_dict': model.state_dict()}

    if optimizer is not None:
        checkpoint['optimizer_state_dict'] = optimizer.state_dict()

    torch.save(checkpoint, path)
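# Why the unwrap above: torch.nn.DataParallel stores the real network under
# .module, so saving model.module.state_dict() keeps the checkpoint keys free
# of the 'module.' prefix and loadable into a plain single-GPU model. A minimal
# usage sketch (names are hypothetical):
#
#     save_checkpoint(model, optimizer=opt, model_name='model', validation_id='3')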
Example #4
def save_loss(losses, ylabel='Loss'):

    min_freq = min(losses.values(), key=lambda x: x[1])[1]
    if min_freq == 0:
        return
    plt('loss').title('Losses curve')
    plt('loss').xlabel('x' + str(min_freq) + ' batches')
    plt('loss').ylabel(ylabel)

    for k in losses:
        ratio = losses[k][1] // min_freq
        # align curves logged at different frequencies on a common x axis
        xs = range(ratio - 1, len(losses[k][0]) * ratio + ratio - 1, ratio)
        plt('loss').plot(list(xs), losses[k][0], label=k)

        _json = json.dumps(losses[k][0])
        path = output_path('loss_{}.logs'.format(k))
        print_debug('Exporting loss at ' + path)
        with open(path, "w") as f:
            f.write(_json)
    plt('loss').legend()
    save_fig_direct_call(figure_name='loss')
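# How the x-alignment above works (a worked example): with
# losses = {'train': (train_vals, 50), 'validation': (val_vals, 100)},
# min_freq is 50 and the x axis is in units of "x50 batches". For the
# validation curve the ratio is 100 // 50 = 2 and the offset is 1, so its
# points land at x = 1, 3, 5, ..., interleaved with the train points at
# x = 0, 1, 2, ...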
Example #5
def do_extraction(dataset, labels_index, file_name='representation_tsne'):
    representation, colors, labels = extract_representation(dataset, model, labels_index=labels_index)

    representation_embedded = TSNE(n_components=2).fit_transform(representation)

    zipped = list(zip(representation_embedded, colors, labels))
    zipped.sort(key=lambda tup: tup[2])
    c = zipped[0][2]
    artists = []
    col, rep = [], []

    artists.append((rep, col, c))

    for row in zipped:
        if row[2] != c:
            col, rep = [], []
            c = row[2]
            artists.append((rep, col, c))
        col.append(row[1])
        rep.append(row[0])

    # converting to numpy
    for i in range(len(artists)):
        artists[i] = (np.array(artists[i][0]), np.array(artists[i][1]), artists[i][2])

    path = output_path(file_name + '.dump')
    with open(path, 'wb') as f:
        pickle.dump(artists, f)
    print_info('Representation saved at: ' + path)
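# The manual grouping above (sort by label, then cut wherever the label
# changes) is equivalent to itertools.groupby; a sketch, assuming the same
# (embedding, color, label) triples (the helper name is hypothetical):
from itertools import groupby

def group_artists_by_label(zipped):
    artists = []
    for label, grp in groupby(zipped, key=lambda tup: tup[2]):
        rows = list(grp)  # materialize: groupby yields one-shot iterators
        artists.append((np.array([rep for rep, _, _ in rows]),
                        np.array([col for _, col, _ in rows]),
                        label))
    return artists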
    def create_sparse(self,
                      long_lat_df,
                      size=64,
                      step=1,
                      error_extract_folder=None,
                      error_cache_size=1000,
                      white_percent_allowed=20,
                      check_file=True):
        """
        The main extraction method for multiple extractions
        :param long_lat_df:
        :param destination_directory:
        :param size:
        :param step:
        :param error_extract_folder:
        :param error_cache_size:
        :param white_percent_allowed:
        :param check_file:
        """

        error_manager = _ErrorManager(
            self.in_proj,
            self.ign_proj,
            output_path()
            if error_extract_folder is None else error_extract_folder,
            cache_size=error_cache_size)

        total = long_lat_df.shape[0]
        start = datetime.datetime.now()
        extract_time = 0

        for idx, row in enumerate(long_lat_df.iterrows()):
            longitude, latitude = row[1][0], row[1][1]
            patch_id = int(row[1][2])

            if idx % 100000 == 99999:
                _print_details(idx + 1, total, start, extract_time, latitude,
                               longitude, len(error_manager))

            t1 = ti.time()
            t2 = 0
            try:
                patch = self.extract_patch(
                    latitude,
                    longitude,
                    size,
                    step,
                    identifier=int(patch_id),
                    white_percent_allowed=white_percent_allowed)
            except ExtractionError as err:
                t2 = ti.time()
                error_manager.append(err)
            else:
                t2 = ti.time()

            finally:
                delta = t2 - t1
                extract_time += delta

        error_manager.write_errors()
Example #7
def create_ign_sparse(source_occ,
                      source_ign,
                      patch_size=64,
                      error_path=None,
                      **kwargs):
    # resolve the default at call time: a call in the signature would run
    # output_path() only once, at import time
    if error_path is None:
        error_path = output_path("error_extract/")

    r = check_source(source_occ)
    occurrences = r['occurrences']
    r = check_source(source_ign)
    ign_images = r['maps']

    la93 = Proj(init='epsg:2154')

    # extract manager
    im_manager = IGNImageManager(ign_images)
    extract_size = patch_size
    extract_step = 1

    # loading the occurrence file
    df = pd.read_csv(occurrences, header='infer', sep=';', low_memory=False)
    max_lat = df['Latitude'].max()
    print(max_lat)

    # sorting the dataset to optimise the extraction
    df.sort_values('Latitude', inplace=True)

    print_info(str(len(df)) + ' occurrences to extract!')
def species_train_test_occurrences(label_species,
                                   train,
                                   val,
                                   test,
                                   species=4448):
    index = output_path('index.json')

    with open(index, 'r') as f:
        s = f.read()
        index_dic = ast.literal_eval(s)

    with open(label_species, 'r') as f:
        s = f.read()
        label_name_dic = ast.literal_eval(s)

    use_label = list(index_dic.keys())[list(index_dic.values()).index(species)]

    datasets = [train, val, test]

    list_occs = [[], [], []]

    for k, d in enumerate(datasets):
        for i, label in enumerate(d.labels):
            if label == int(use_label):
                list_occs[k].append(d.dataset[i])
    print(list_occs)

    for o in list_occs[0]:
        print('%f\t%f\tcircle6\tblue\ttrain' % (o[0], o[1]))
    for o in list_occs[1]:
        print('%f\t%f\tcircle6\tgreen\tval' % (o[0], o[1]))
    for o in list_occs[2]:
        print('%f\t%f\tcircle6\tred\ttest' % (o[0], o[1]))
def plot_species_on_map(grid_points,
                        label_species=None,
                        species=0,
                        log_scale=False,
                        figsize=5,
                        mean_size=1,
                        softmax=False,
                        alpha=None):
    if softmax:
        acts = np.load(output_path('predictions.npy'))
    else:
        acts = np.load(output_path('logits.npy'))

    index = output_path('index.json')

    with open(index, 'r') as f:
        s = f.read()
        index_dic = ast.literal_eval(s)

    use_label = list(index_dic.keys())[list(index_dic.values()).index(species)]

    if label_species is not None:

        with open(label_species, 'r') as f:
            s = f.read()
            label_name_dic = ast.literal_eval(s)

        true_label = index_dic[str(use_label)]
        legend = label_name_dic[true_label]

    else:
        legend = str(species)

    # activations has shape nb points x last layer size

    plot_on_map(acts,
                grid_points.ids,
                n_cols=1,
                n_rows=1,
                figsize=figsize,
                log_scale=log_scale,
                mean_size=mean_size,
                selected=(int(use_label), ),
                alpha=alpha,
                legend=(legend, ),
                output="s" + str(species) + "_pred")
Example #10
def save_fig_direct_call(path=None, figure_name=None, extension='jpeg'):
    if '.' not in extension:
        extension = '.' + extension
    global figures
    if figure_name is None:
        for k in figures.keys():
            path_name = output_path(k + extension)
            figure = figures[k][1]
            _save_fig(path_name, figure)
            matplotlib.pyplot.close(figure)
        figures = {}
    else:
        path_name = output_path(figure_name +
                                extension) if path is None else path
        figure = figures[figure_name][1]
        _save_fig(path_name, figure)
        fig = figures.pop(figure_name)
        matplotlib.pyplot.close(fig[1])
Example #11
def save_classifier_weight(model):
    w = model.state_dict()['fc.weight']
    w = w.cpu().numpy()  # move to host memory first: .numpy() fails on CUDA tensors
    print(w)
    print(type(w))
    print_info("save weight")
    result_path = output_path('weight.npy')
    np.save(result_path, w)
    print_info("saved !")
def _load_checkpoint(model_name, path=None):
    if path is None:
        path = output_path(_checkpoint_path.format(model_name),
                           have_validation=True)

    global _checkpoint
    if not os.path.isfile(path):
        print_errors('{} does not exist'.format(path), do_exit=True)
    print_debug('Loading checkpoint from ' + path)
    _checkpoint[model_name] = torch.load(path)
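# Note: torch.load restores tensors to the device they were saved from, so a
# checkpoint written on GPU fails to load on a CPU-only machine. A defensive
# variant (a sketch) pins the load to CPU and lets the caller move the model:
#
#     _checkpoint[model_name] = torch.load(path, map_location='cpu')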
Example #13
def load_loss(name):
    path = output_path(name + '.logs')
    print_debug('Loading loss at ' + path)
    if os.path.exists(path):
        with open(path) as f:
            loss = json.load(f)
        return loss
    else:
        print_debug(path + ' does not exist...')
    return []
Example #14
    def _config(self, input_data):
        self.filter = np.zeros((input_data.shape[1], ))
        index_path = output_path('index.json')
        indexed_labels = reverse_indexing(get_index(index_path))

        with open(self.filter_file_path) as f:
            for l in f:
                if indexed_labels is None:
                    # without an index, the label is already the column position
                    self.filter[int(l)] = 1.
                elif int(l) in indexed_labels:
                    self.filter[indexed_labels[int(l)]] = 1.
def plot_occurrences(train, val, test):

    # df_train = pd.read_csv("/home/bdeneu/data/occurrences_glc18.csv", header='infer', sep=';', low_memory=False)
    # df_test = pd.read_csv("/home/bdeneu/data/occurrences_glc18_test_withlabel.csv", header='infer', sep=';', low_memory=False)
    # d_train = df_train[['Latitude', 'Longitude']].to_numpy()
    # d_test = df_test[['Latitude', 'Longitude']].to_numpy()

    d_train = np.asarray(train.dataset)
    d_test = np.asarray(test.dataset)
    d_val = np.asarray(val.dataset)

    geo_tr = project(d_train[:, 0], d_train[:, 1])
    #geo_te = project(d_test[:, 0], d_test[:, 1])
    #geo_va = project(d_val[:, 0], d_val[:, 1])

    #print(geo_te)
    s = 0.8
    plt.style.use('classic')
    fig, ax = plt.subplots()
    #ax.scatter(geo_tr[0][:], geo_tr[1][:], color='#00cc99', marker='s', s=s, label="train")
    ax.scatter(geo_tr[0][:], geo_tr[1][:], color='#93c47d', marker='s', s=s, label="train")
    #ax.scatter(geo_va[0][:], geo_va[1][:], color='#33ff33', marker='s', s=s, label="val")
    #ax.scatter(geo_te[0][:], geo_te[1][:], color='#d9ff66', marker='s', s=s, label="test")
    # ax = fig.add_subplot(111, axisbg='white')

    ax.set_xlim(3200, 4400)
    ax.set_ylim(2000, 3200)
    ax.spines['bottom'].set_color('#dddddd')
    ax.spines['top'].set_color('#dddddd')
    ax.spines['right'].set_color('#dddddd')
    ax.spines['left'].set_color('#dddddd')
    ax.tick_params(axis='x', colors='#dddddd')
    ax.tick_params(axis='y', colors='#dddddd')
    ax.yaxis.label.set_color('#dddddd')
    ax.xaxis.label.set_color('#dddddd')
    ax.title.set_color('#dddddd')
    #plt.legend(loc=1, markerscale=0.8, facecolor='#00FFFFFF')
    print("here")
    plt.show()
    print_info('figure saved at: ' + output_path('occurrences.png'))
    fig.savefig(output_path('occurrences.png'), transparent=True)
def load_checkpoint(model, model_name='model', validation_id=None):
    """
    Load a checkpoint and restore the model's state.
    """
    path = output_path(_checkpoint_path.format(model_name),
                       validation_id=validation_id,
                       have_validation=True)
    _load_model(
        model.module if type(model) is torch.nn.DataParallel else model,
        model_name,
        path=path,
        reload=True)
Example #17
def get_species_neurons_activations(model, grid_points, batch_size=32):
    activations = predict_grid(model, grid_points, batch_size=batch_size, features_activation=True)
    predictions = predict_grid(model, grid_points, batch_size=batch_size)
    logits = predict_grid(model, grid_points, batch_size=batch_size, logit=True)

    result_path = output_path('activations.npy')
    print_info("save activations:", result_path)
    np.save(result_path, activations)
    result_path = output_path('predictions.npy')
    print_info("save predictions:", result_path)
    np.save(result_path, predictions)
    result_path = output_path('logits.npy')
    print_info("save logits:", result_path)
    np.save(result_path, logits)
    print_info("saved!")

    print_info("save weight")
    w = model.state_dict()['fc.weight']
    w = w.cpu().numpy()  # CUDA tensors must be moved to host memory before .numpy()
    result_path = output_path('weight.npy')
    np.save(result_path, w)
    print_info("saved!")
Example #18
    def __init__(self,
                 train,
                 top_k=30,
                 n_species=4520,
                 final_validation=False):
        super().__init__(final_validation, True)
        self.file_name = output_path("_result_top" + str(top_k) +
                                     "_for_all_species.npy")
        self.top_k = top_k
        self.train = train
        self.prior = np.zeros(n_species, dtype=int)
        for label in self.train.labels:
            self.prior[label] += 1
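# The counting loop above is equivalent to numpy's bincount; a sketch, assuming
# the labels are non-negative ints smaller than n_species:
#
#     self.prior = np.bincount(self.train.labels, minlength=n_species)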
Example #19
    def last_call(self):
        step = 0.005
        x = np.arange(-1, 1. + step, step)
        y = np.sqrt(np.maximum(1. - x**2, np.zeros(x.shape)))
        plt('circle').plot(x, y)
        y = -np.sqrt(np.maximum(1. - x**2, np.zeros(x.shape)))
        plt('circle').plot(x, y)
        labels = self.dataset.labels
        dataset = self.dataset.dataset
        plt('circle').scatter(dataset[labels == 0][:, 0],
                              dataset[labels == 0][:, 1])
        plt('circle').scatter(dataset[labels == 1][:, 0],
                              dataset[labels == 1][:, 1])
        # find the largest coefficient norm, used to normalize the arrows
        for p in self.parameters[0]:
            norm = np.sqrt(p[0]**2 + p[1]**2)
            if norm > self.coef_norm:
                self.coef_norm = norm

        for i, p in enumerate(self.parameters[0]):
            p /= self.coef_norm
            norm = np.sqrt(p[0]**2 + p[1]**2)

            new_norm = norm * self.wk[0][i] if self.use_wk else norm

            b = -self.bias[0][i] if self.use_bias else 0.
            b /= norm
            dx, dy = p[0] * new_norm / norm, p[1] * new_norm / norm

            x, y = (0, 0) if not self.use_bias else (p[0] * b / norm,
                                                     p[1] * b / norm)

            self.arrows.append(
                plt('circle').arrow(x,
                                    y,
                                    dx,
                                    dy,
                                    shape='full',
                                    head_width=0.04,
                                    head_length=0.08))

        fig = get_figure('circle')
        self.axis = fig.gca()

        anim = FuncAnimation(fig,
                             self.update,
                             frames=np.arange(0, len(self.parameters)),
                             interval=200)
        path = output_path('circle.gif')
        print_info('Saving GIF at ' + path)
        anim.save(path, dpi=80, writer='imagemagick')
        delete_figure('circle')
Example #20
def export_bigdata(model, test, batch_size, buffer_size, size):
    num_workers = special_parameters.nb_workers
    test_loader = torch.utils.data.DataLoader(test,
                                              shuffle=False,
                                              batch_size=batch_size,
                                              num_workers=num_workers)

    results = []

    model.eval()
    export_path = output_path('predictions.csv')
    # check if labels have been indexed
    index_path = output_path('index.json')

    indexed_labels = get_index(index_path)

    with open(export_path, 'w') as f:
        print_info('Exporting predictions at ' + export_path)
        f.write('id,class_id,rank,proba\n')  # header

        # progressbar triggers a deprecation warning through an old import
        warnings.simplefilter('ignore')
        bar = progressbar.ProgressBar(max_value=len(test_loader))
        warnings.simplefilter('default')
        for idx, data in enumerate(test_loader):
            # get the inputs
            inputs, labels = data

            outputs = model(inputs)

            results.append(outputs.detach().cpu().numpy())
            if len(results) >= buffer_size:
                _export_bigdata(f, results, test, indexed_labels, size)
                results = []
            bar.update(idx)
        if len(results) > 0:  # flush whatever remains in the buffer
            _export_bigdata(f, results, test, indexed_labels, size)
        bar.finish()
def plot_activations_on_map(grid_points,
                            n_rows=3,
                            n_cols=5,
                            selected=tuple(),
                            log_scale=False,
                            figsize=4,
                            mean_size=10):
    activations = np.load(output_path('activations.npy'))

    # activations has shape nb points x last layer size

    plot_on_map(activations,
                grid_points.ids,
                n_cols=n_cols,
                n_rows=n_rows,
                figsize=figsize,
                log_scale=log_scale,
                mean_size=mean_size,
                selected=selected)
Example #22
import torch

from engine.logging import print_h1
from engine.path import output_path

model_params = {
    # for inception, aux_logits must be False
    'model_name': 'inception',
    'num_classes': 2,
    'feature_extract': True
}
input_size = 299  # inception
generator = PaintingDatasetGenerator(source='paintings_xviii')

export_result = output_path('results.csv')

painter_list = generator.unique_painters()

with open(export_result, 'w') as f:
    f.write('painter_val;painter_test;prediction;true_label\n')

for i in range(len(painter_list)):
    painter_val = painter_list[i]
    painter_test = painter_list[(i + 1) % len(painter_list)]
    print_h1('||| PAINTER VAL: ' + painter_val + ', PAINTER TEST: ' +
             painter_test + ' |||')

    train, val, test, _ = generator.country_dataset_one_fold(
        painter_val=painter_val, painter_test=painter_test)
def check_extraction(source,
                     save_errors=True,
                     save_filtered=True,
                     id_name='X_key'):
    """
    check if all patches from an occurrences file have been extracted. Can save the list of errors and
    filtered the dataset keeping the correctly extracted data.

    :param id_name: the column that contains the patch id that will be used to construct its path
    :param save_filtered: save the dataframe filtered from the error
    :param save_errors: save the errors found in a file
    :param source: the source referring the occurrence file and the patches path
    """

    # retrieve details of the source
    r = check_source(source)
    if 'occurrences' not in r or 'patches' not in r:
        print_errors(
            'Only sources with occurrences and patches can be checked',
            do_exit=True)

    df = pd.read_csv(r['occurrences'],
                     header='infer',
                     sep=';',
                     low_memory=False)
    nb_errors = 0
    errors = []
    for idx, row in progressbar.progressbar(enumerate(df.iterrows())):
        patch_id = str(int(row[1][id_name]))

        # constructing the path of a patch given its id
        path = os.path.join(r['patches'], patch_id[-2:], patch_id[-4:-2],
                            patch_id + '.npy')

        # if the path does not correspond to a file, then it's an error
        if not os.path.isfile(path):
            errors.append(row[1][id_name])
            nb_errors += 1

    if nb_errors > 0:
        # summary of the errors
        print_info(str(nb_errors) + ' errors found during the check...')

        if save_errors:
            # filter the dataframe using the errors
            df_errors = df[df[id_name].isin(errors)]

            error_path = output_path('_errors.csv')
            print_info('Saving error file at: ' + error_path)

            # save dataframe to the error file
            df_errors.to_csv(error_path, header=True, index=False, sep=';')
        if save_filtered:
            # filter the dataframe keeping the non errors
            df_filtered = df[~df[id_name].isin(errors)]
            filtered_path = r['occurrences'] + '.tmp'
            print_info('Saving filtered dataset at: ' + filtered_path)
            df_filtered.to_csv(filtered_path,
                               header=True,
                               index=False,
                               sep=';')
    else:
        print_info('No error has been found!')
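# How the patch path above is built (a worked example): ids are sharded on
# their last four digits, so patch_id '1234567' maps to
# <patches>/67/45/1234567.npy, which keeps any single directory from holding
# too many files.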
Example #24
def fit(train,
        test,
        validation=None,
        validation_params=None,
        export_params=None,
        model_name='model',
        **kwargs):
    """
    Fit a light GBM model. If validation_only or export is True, then the training is not performed and the model is
    loaded.
    :param model_name:
    :param export_params:
    :param validation_params:
    :param train:
    :param test:
    :param validation:
    :param kwargs:
    :return:
    """

    nb_labels = _nb_labels(train, test, validation)

    train_data = _to_lgb_dataset(train)
    test_data = _to_lgb_dataset(test)
    val_data = test_data if validation is None else _to_lgb_dataset(validation)

    if not (special_parameters.validation_only or special_parameters.export):
        print_h1('Training: ' + special_parameters.setup_name)
        num_round = 10

        param = kwargs
        merge_smooth(param, _default_params)
        param['num_class'] = nb_labels

        bst = lgb.train(param, train_data, num_round, valid_sets=[val_data])
        bst.save_model(output_path('models/{}.bst'.format(model_name)))
    else:
        bst = lgb.Booster(
            model_file=output_path('models/{}.bst'.format(model_name)))

    print_h1('Validation/Export: ' + special_parameters.setup_name)

    testset, labels = test.numpy()
    predictions = bst.predict(testset)

    # validation
    if special_parameters.validation_only or not special_parameters.export:
        res = validate(
            predictions,
            labels,
            **({} if validation_params is None else validation_params),
            final=True)

        print_notification(res, end='')

        if special_parameters.mail >= 1:
            send_email('Final results for XP ' + special_parameters.setup_name,
                       res)
        if special_parameters.file:
            save_file(output_path('validation.txt'),
                      'Final results for XP ' + special_parameters.setup_name,
                      res)

    if special_parameters.export:
        export_results(test, predictions,
                       **({} if export_params is None else export_params))
Example #25
def fit(model_z,
        train,
        test,
        val=None,
        training_params=None,
        predict_params=None,
        validation_params=None,
        export_params=None,
        optim_params=None,
        model_selection_params=None):
    """
    This function is the core of an experiment. It performs the ml procedure as well as the call to validation.
    :param training_params: parameters for the training procedure
    :param val: validation set
    :param test: the test set
    :param train: The training set
    :param optim_params:
    :param export_params:
    :param validation_params:
    :param predict_params:
    :param model_z: the model that should be trained
    :param model_selection_params:
    """
    # configuration

    training_params, predict_params, validation_params, export_params, optim_params, \
        cv_params = merge_dict_set(
            training_params, TRAINING_PARAMS,
            predict_params, PREDICT_PARAMS,
            validation_params, VALIDATION_PARAMS,
            export_params, EXPORT_PARAMS,
            optim_params, OPTIM_PARAMS,
            model_selection_params, MODEL_SELECTION_PARAMS
        )

    train_loader, test_loader, val_loader = _dataset_setup(
        train, test, val, **training_params)

    statistics_path = output_path('metric_statistics.dump')

    metrics_stats = Statistics(
        model_z, statistics_path,
        **cv_params) if cv_params.pop('cross_validation') else None

    validation_path = output_path('validation.txt')

    # training parameters
    optim = optim_params.pop('optimizer')
    iterations = training_params.pop('iterations')
    gamma = training_params.pop('gamma')
    loss = training_params.pop('loss')
    log_modulo = training_params.pop('log_modulo')
    val_modulo = training_params.pop('val_modulo')
    first_epoch = training_params.pop('first_epoch')

    # callbacks for ml tests
    vcallback = validation_params.pop(
        'vcallback') if 'vcallback' in validation_params else None

    if iterations is None:
        print_errors(
            'Iterations must be set',
            exception=TrainingConfigurationException('Iterations is None'))

    # before ml callback
    if vcallback is not None and special_parameters.train and first_epoch < max(
            iterations):
        init_callbacks(vcallback, val_modulo,
                       max(iterations) // val_modulo, train_loader.dataset,
                       model_z)

    max_iterations = max(iterations)

    if special_parameters.train and first_epoch < max(iterations):
        print_h1('Training: ' + special_parameters.setup_name)

        loss_logs = [] if first_epoch < 1 else load_loss('loss_train')

        loss_val_logs = [] if first_epoch < 1 else load_loss('loss_validation')

        opt = create_optimizer(model_z.parameters(), optim, optim_params)

        scheduler = MultiStepLR(opt, milestones=list(iterations), gamma=gamma)

        # number of batches per epoch
        epoch_size = len(train_loader)

        # one log per epoch if value is -1
        log_modulo = epoch_size if log_modulo == -1 else log_modulo

        epoch = 0
        for epoch in range(max_iterations):

            if epoch < first_epoch:
                # opt.step()
                _skip_step(scheduler, epoch)
                continue
            # saving epoch to enable restart
            export_epoch(epoch)
            model_z.train()

            # printing new epoch
            print_h2('-' * 5 + ' Epoch ' + str(epoch + 1) + '/' +
                     str(max_iterations) + ' (lr: ' + str(scheduler.get_lr()) +
                     ') ' + '-' * 5)

            running_loss = 0.0

            for idx, data in enumerate(train_loader):

                # get the inputs
                inputs, labels = data

                # wrap labels in Variable as input is managed through a decorator
                # labels = model_z.p_label(labels)
                if use_gpu():
                    labels = labels.cuda()

                # zero the parameter gradients
                opt.zero_grad()
                outputs = model_z(inputs)
                loss_value = loss(outputs, labels)
                loss_value.backward()

                opt.step()

                # accumulate the running loss for logging
                running_loss += loss_value.item()
                if idx % log_modulo == log_modulo - 1:  # print every log_modulo mini-batches
                    print('[%d, %5d] loss: %.5f' %
                          (epoch + 1, idx + 1, running_loss / log_modulo))

                    # tensorboard support
                    add_scalar('Loss/train', running_loss / log_modulo)
                    loss_logs.append(running_loss / log_modulo)
                    running_loss = 0.0

            # end of epoch update of learning rate scheduler
            scheduler.step(epoch + 1)

            # saving the model and the current loss after each epoch
            save_checkpoint(model_z, optimizer=opt)

            # validation of the model
            if epoch % val_modulo == val_modulo - 1:
                validation_id = str(int((epoch + 1) / val_modulo))

                # validation call
                predictions, labels, loss_val = predict(
                    model_z, val_loader, loss, **predict_params)
                loss_val_logs.append(loss_val)

                res = '\n[validation_id:' + validation_id + ']\n' + validate(
                    predictions,
                    labels,
                    validation_id=validation_id,
                    statistics=metrics_stats,
                    **validation_params)

                # save statistics for robust cross validation
                if metrics_stats:
                    metrics_stats.save()

                print_notification(res)

                if special_parameters.mail == 2:
                    send_email(
                        'Results for XP ' + special_parameters.setup_name +
                        ' (epoch: ' + str(epoch + 1) + ')', res)
                if special_parameters.file:
                    save_file(
                        validation_path,
                        'Results for XP ' + special_parameters.setup_name +
                        ' (epoch: ' + str(epoch + 1) + ')', res)

                # checkpoint
                save_checkpoint(model_z,
                                optimizer=opt,
                                validation_id=validation_id)

                # callback
                if vcallback is not None:
                    run_callbacks(vcallback, (epoch + 1) // val_modulo)

            # save loss
            save_loss(
                {  # // log_modulo * log_modulo in case log_modulo does not divide epoch_size
                    'train': (loss_logs, log_modulo),
                    'validation':
                    (loss_val_logs,
                     epoch_size // log_modulo * log_modulo * val_modulo)
                },
                ylabel=str(loss))

        # saving last epoch; if --restart is set, the train will not be executed
        export_epoch(epoch + 1)

        # callback
        if vcallback is not None:
            finish_callbacks(vcallback)

    # final validation
    if special_parameters.evaluate or special_parameters.export:
        print_h1('Validation/Export: ' + special_parameters.setup_name)
        if metrics_stats is not None:
            # change the parameter states of the model to best model
            metrics_stats.switch_to_best_model()

        predictions, labels, val_loss = predict(model_z,
                                                test_loader,
                                                loss,
                                                validation_size=-1,
                                                **predict_params)

        if special_parameters.evaluate:

            res = validate(predictions,
                           labels,
                           statistics=metrics_stats,
                           **validation_params,
                           final=True)

            print_notification(res, end='')

            if special_parameters.mail >= 1:
                send_email(
                    'Final results for XP ' + special_parameters.setup_name,
                    res)
            if special_parameters.file:
                save_file(
                    validation_path,
                    'Final results for XP ' + special_parameters.setup_name,
                    res)

        if special_parameters.export:
            export_results(test_loader.dataset, predictions, **export_params)

    return metrics_stats
def _occurrence_loader(dataset_class,
                       occurrences,
                       validation_size=0.1,
                       test_size=0.1,
                       label_name='Label',
                       id_name='id',
                       splitter=train_test_split,
                       filters=tuple(),
                       online_filters=tuple(),
                       postprocessing=tuple(),
                       save_index='default',
                       limit=None,
                       source_name='unknown',
                       stop_filter=False,
                       **kwargs):
    """
    returns a train and a test set
    :type stop_filter: object
    :param source_name:
    :param postprocessing: post processing functions to apply on datasets
    :param limit:
    :param save_index: True, 'save' or False or 'load_and_save'
    :param online_filters:
    :param filters:
    :param splitter:
    :param rasters:
    :param id_name:
    :param label_name:
    :param validation_size:
    :param occurrences:
    :param dataset_class:
    :param test_size:
    :return: train, val and test set, pytorch ready
    """
    # initialize index to a specific behaviour if save index is default
    save_index = index_init(save_index, label_name)

    labels_indexed_bis = None

    # load an existing index
    if get_to_load(save_index):
        path = output_path('index.json')
        labels_indexed_bis = reverse_indexing(
            get_index(path))  # loading index and reversing it

    # or create index if failed or did not have to load one
    if labels_indexed_bis is None:
        # the test is for multi-labels
        labels_indexed_bis = {} if type(label_name) is not tuple else [
            {} for _ in label_name
        ]

    # do not load all the lines if their number is limited
    if limit is None:
        df = pd.read_csv(occurrences,
                         header='infer',
                         sep=';',
                         low_memory=False)
    else:
        df = pd.read_csv(occurrences,
                         header='infer',
                         sep=';',
                         low_memory=False,
                         nrows=limit)

    # filters unwanted occurrences
    df = df[df.apply(
        lambda _row: not online_filters_processing(online_filters, _row),
        axis=1)]

    # set label to -1 if no label or index label
    if label_name is None or not label_name:
        df['label'] = -1
    else:
        df['label'] = df[label_name].apply(
            lambda name: index_labels(labels_indexed_bis, name))

    ids = df[id_name].to_numpy()
    labels = df['label'].to_numpy()
    dataset = df[['Latitude', 'Longitude']].to_numpy()

    # if need to save index, save it
    if get_to_save(save_index):
        path = output_path('index.json')
        save_reversed_index(
            path, labels_indexed_bis)  # saving index after reversing it...

    columns = (labels, dataset, ids)
    # splitting train test
    train, test = perform_split(columns, test_size, splitter)

    # splitting validation
    train, val = perform_split(train, validation_size, splitter)

    # apply filters
    # for f in filters:  # TODO update filters taking into account the new structure
    #    f(*train, *val, *test)
    if test_size != 1 and label_name is not None and not stop_filter:
        # Filtering elements that are only in the test set
        test = filter_test((train[0], val[0]), *test)

    # train set
    train = dataset_class(*train, **kwargs)

    # test set
    test = dataset_class(*test, **kwargs)

    # validation set
    validation = dataset_class(*val, **kwargs)

    # apply special functions on datasets
    for process in postprocessing:
        process(train, validation, test)

    # print dataset statistics
    labels_size = labels_indexed_str(
        labels_indexed_bis) if label_name is not None else '0'
    print_dataset_statistics(len(train), len(validation), len(test),
                             source_name, labels_size)

    return train, validation, test
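# A minimal usage sketch (GeoLifeClefDataset is a hypothetical dataset class;
# the csv is assumed to carry 'Latitude', 'Longitude', a label column and an
# id column, as read above):
#
#     train, val, test = _occurrence_loader(
#         GeoLifeClefDataset, 'occurrences/occurrences.csv',
#         validation_size=0.1, test_size=0.1,
#         label_name='Label', id_name='id')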
Example #27
    def __init__(self, max_top_k=100, final_validation=False):
        super().__init__(final_validation, True)
        self.file_name = output_path("_result_range_top" + str(max_top_k) +
                                     "_by_species.npy")
        self.max_top_k = max_top_k
        self.result = np.zeros(self.max_top_k)
Example #28
    def __call__(self, predictions, labels):
        np.save(output_path('predictions.npy'), predictions)
        return self.__str__()
Example #29
    def __str__(self):
        return "Predictions saved at \"" + output_path(
            'predictions.npy') + "\""
Example #30
def fit(model_z, game_class, game_params=None, training_params=None, predict_params=None, validation_params=None,
        export_params=None, optim_params=None):
    """
    This function is the core of an experiment. It performs the ml procedure as well as the call to validation.
    :param game_params:
    :param game_class:
    :param training_params: parameters for the training procedure
    :param optim_params:
    :param export_params:
    :param validation_params:
    :param predict_params:
    :param model_z: the model that should be trained
    """
    # configuration
    game_params, training_params, predict_params, validation_params, export_params, optim_params = merge_dict_set(
        game_params, GAME_PARAMS,
        training_params, TRAINING_PARAMS,
        predict_params, PREDICT_PARAMS,
        validation_params, VALIDATION_PARAMS,
        export_params, EXPORT_PARAMS,
        optim_params, OPTIM_PARAMS
    )

    validation_path = output_path('validation.txt')

    output_size = model_z.output_size if hasattr(model_z, 'output_size') else model_z.module.output_size

    # training parameters
    optim = optim_params.pop('optimizer')
    iterations = training_params.pop('iterations')
    gamma = training_params.pop('gamma')
    batch_size = training_params.pop('batch_size')
    loss = training_params.pop('loss')
    log_modulo = training_params.pop('log_modulo')
    val_modulo = training_params.pop('val_modulo')
    first_epoch = training_params.pop('first_epoch')
    rm_size = training_params.pop('rm_size')
    epsilon_start = training_params.pop('epsilon_start')
    epsilon_end = training_params.pop('epsilon_end')

    evaluate = special_parameters.evaluate
    # export = special_parameters.export
    do_train = special_parameters.train
    max_iterations = max(iterations)

    game = game_class(**game_params)

    replay_memory = ReplayMemory(rm_size)

    if do_train and first_epoch < max(iterations):
        print_h1('Training: ' + special_parameters.setup_name)

        state = unsqueeze(init_game(game, replay_memory, output_size, len(replay_memory)))
        memory_loader = torch.utils.data.DataLoader(
            replay_memory, shuffle=True, batch_size=batch_size,
            num_workers=16, drop_last=True
        )

        if batch_size > len(replay_memory):
            print_errors('Batch size is bigger than available memory...', do_exit=True)

        loss_logs = [] if first_epoch < 1 else load_loss('loss_train')

        loss_val_logs = [] if first_epoch < 1 else load_loss('loss_validation')

        rewards_logs = [] if first_epoch < 1 else load_loss('train_rewards')
        rewards_val_logs = [] if first_epoch < 1 else load_loss('val_rewards')

        epsilon_decrements = np.linspace(epsilon_start, epsilon_end, iterations[-1])

        opt = create_optimizer(model_z.parameters(), optim, optim_params)

        scheduler = MultiStepLR(opt, milestones=list(iterations), gamma=gamma)

        # number of transitions in the replay memory, used as the epoch size
        epoch_size = len(replay_memory)

        # one log per epoch if value is -1
        log_modulo = epoch_size if log_modulo == -1 else log_modulo

        epoch = 0

        running_loss = 0.0
        running_reward = 0.0
        norm_opt = 0
        norm_exp = 0

        for epoch in range(max_iterations):

            if epoch < first_epoch:
                # opt.step()
                _skip_step(scheduler, epoch)
                continue
            # saving epoch to enable restart
            export_epoch(epoch)

            epsilon = epsilon_decrements[epoch]

            model_z.train()

            # printing new epoch
            print_h2('-' * 5 + ' Epoch ' + str(epoch + 1) + '/' + str(max_iterations) +
                     ' (lr: ' + str(scheduler.get_lr()) + ') ' + '-' * 5)

            for idx, data in enumerate(memory_loader):

                # the two Q-learning steps
                state, _, finish = _exploration(model_z, state, epsilon, game, replay_memory, output_size)

                if finish:
                    # if the game is finished, we save the score
                    running_reward += game.score_
                    norm_exp += 1

                # gradient zeroing and the update step are handled inside _optimization
                running_loss += _optimization(model_z, data, gamma, opt, loss)
                norm_opt += 1

            if epoch % log_modulo == log_modulo - 1:
                # normalize by the number of optimization steps and finished
                # games actually seen since the last log
                print('[%d, %5d]\tloss: %.5f' % (epoch + 1, idx + 1, running_loss / max(norm_opt, 1)))
                print('\t\t reward: %.5f' % (running_reward / max(norm_exp, 1)))
                loss_logs.append(running_loss / max(norm_opt, 1))
                rewards_logs.append(running_reward / max(norm_exp, 1))
                running_loss = 0.0
                running_reward = 0.0
                norm_opt = 0
                norm_exp = 0

            # end of epoch update of learning rate scheduler
            scheduler.step(epoch + 1)

            # saving the model and the current loss after each epoch
            save_checkpoint(model_z, optimizer=opt)

            # validation of the model
            if epoch % val_modulo == val_modulo - 1:
                validation_id = str(int((epoch + 1) / val_modulo))

                # validation call
                loss_val = play(model_z, output_size, game_class, game_params, 1)

                loss_val_logs.append(loss_val)

                res = '\n[validation_id:' + validation_id + ']\n' + str(loss_val)

                print_notification(res)

                if special_parameters.mail == 2:
                    send_email('Results for XP ' + special_parameters.setup_name + ' (epoch: ' + str(epoch + 1) + ')',
                               res)
                if special_parameters.file:
                    save_file(validation_path, 'Results for XP ' + special_parameters.setup_name +
                              ' (epoch: ' + str(epoch + 1) + ')', res)

                # checkpoint
                save_checkpoint(model_z, optimizer=opt, validation_id=validation_id)

            # save loss
            save_loss(
                {  # // log_modulo * log_modulo in case log_modulo does not divide epoch_size
                    'train': (loss_logs, log_modulo),
                    # 'validation': (loss_val_logs, val_modulo)
                },
                ylabel=str(loss)
            )

        # saving last epoch; if --restart is set, the train will not be executed
        export_epoch(epoch + 1)

    # final validation
    print_h1('Validation/Export: ' + special_parameters.setup_name)
    if evaluate:
        loss_val = play(model_z, output_size, game_class, game_params, 500)

        res = str(loss_val)

        print_notification(res, end='')

        if special_parameters.mail >= 1:
            send_email('Final results for XP ' + special_parameters.setup_name, res)
        if special_parameters.file:
            save_file(validation_path, 'Final results for XP ' + special_parameters.setup_name, res)