def initial_call(self, modulo, nb_calls, dataset, model):
    print_info('Init Callback: ' + str(self))
    self.modulo = modulo
    self.nb_calls = nb_calls
    if self.dataset is None:
        self.dataset = dataset
    # unwrap the underlying model when it is wrapped in DataParallel
    self.model = model.module if type(model) is DataParallel else model

def do_extraction(dataset, labels_index, file_name='representation_tsne'):
    # `model` is expected to be available in the enclosing scope
    representation, colors, labels = extract_representation(dataset, model, labels_index=labels_index)

    # project the representation in 2D with t-SNE
    representation_embedded = TSNE(n_components=2).fit_transform(representation)

    # group the points by label so that each label becomes one scatter artist
    zipped = list(zip(representation_embedded, colors, labels))
    zipped.sort(key=lambda tup: tup[2])

    c = zipped[0][2]
    artists = []
    col, rep = [], []
    artists.append((rep, col, c))
    for row in zipped:
        if row[2] != c:
            col, rep = [], []
            c = row[2]
            artists.append((rep, col, c))
        col.append(row[1])
        rep.append(row[0])

    # converting to numpy
    for i in range(len(artists)):
        artists[i] = (np.array(artists[i][0]), np.array(artists[i][1]), artists[i][2])

    path = output_path(file_name + '.dump')
    with open(path, 'wb') as f:
        pickle.dump(artists, f)
    print_info('Representation saved at: ' + path)

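# A minimal sketch of reloading the dumped artists and turning them back into
# a scatter plot; the (points, colors, label) tuple layout follows
# do_extraction above, while the plain matplotlib calls are an assumption
# (aliased to avoid clashing with the repository's plt() wrapper).
import pickle
import matplotlib.pyplot as plt_mpl

with open(output_path('representation_tsne.dump'), 'rb') as f:
    artists = pickle.load(f)

for points, colors, label in artists:
    # one scatter artist per label, as built by do_extraction
    plt_mpl.scatter(points[:, 0], points[:, 1], c=colors, label=str(label), s=4)
plt_mpl.legend()
plt_mpl.savefig(output_path('representation_tsne.png'))
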
def create_model(model_class, model_params=None, model_name='model'):
    """
    Create and eventually load a model.

    :param model_class: the class of the model to instantiate
    :param model_params: the keyword arguments passed to the constructor
    :param model_name: the name used to recover a checkpoint
    :return: the model, wrapped in DataParallel when running on GPU
    """
    model_params = {} if model_params is None else model_params
    model = model_class(**model_params)

    if special_parameters.load_model:  # recover from checkpoint
        _load_model(model, model_name)

    # configure usage on GPU
    if use_gpu():
        model.to(first_device())
        model = torch.nn.DataParallel(model, device_ids=all_devices())

    # print info about devices
    print_info('Device(s): ' + str(device_description()))

    return model

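# A minimal usage sketch, assuming fully_connected.Net accepts these keyword
# arguments; the parameter names and values are illustrative only.
model = create_model(fully_connected.Net,
                     model_params={'n_inputs': 77, 'n_labels': 4520},
                     model_name='fc_model')
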
def create_ign_sparse(source_occ, source_ign, patch_size=64, error_path=output_path("error_extract/"), **kwargs):
    r = check_source(source_occ)
    occurrences = r['occurrences']
    r = check_source(source_ign)
    ign_images = r['maps']

    # Lambert 93 projection (the French IGN reference system)
    la93 = Proj(init='epsg:2154')

    # extract manager
    im_manager = IGNImageManager(ign_images)
    extract_size = patch_size
    extract_step = 1

    # loading the occurrence file
    df = pd.read_csv(occurrences, header='infer', sep=';', low_memory=False)

    max_lat = df['Latitude'].max()
    print_info('Maximum latitude: ' + str(max_lat))

    # sorting the dataset to optimise the extraction
    df.sort_values('Latitude', inplace=True)

    print_info(str(len(df)) + ' occurrences to extract!')

def plot_corr(model, X, absolute_value, threshold, figure_name='pcr'):
    activations, _ = get_activations(model, X)
    activations, _ = np.unique(activations, return_inverse=True, axis=0)
    min_activations = 10

    # partitions that are not on the domain are removed
    # (neurons whose activation is constant over X carry no information)
    for i, v in enumerate(np.all(activations == activations[0, :], axis=0)):
        if v:
            activations[:, i] = 0

    # unique after corrections
    activations, _ = np.unique(activations, return_inverse=True, axis=0)
    nb_activations = activations.shape[0]
    nb_c_activations = min(nb_activations, min_activations)

    # count the 2D (linear) parameter matrices, skipping the last one
    nb_params = -1
    for n, p in model.named_parameters():
        if len(p.shape) == 2:
            nb_params += 1

    plt(figure_name, figsize=(nb_c_activations * 6.4, nb_params * 4.8))
    vmin = 0. if absolute_value else -1.
    vmax = 1.

    print_info(str(nb_activations) + ' affine spaces used')

    for idx, a in enumerate(range(nb_c_activations)):
        tc = 0
        c = 0
        for name, params in model.named_parameters():
            if len(params.shape) == 2:
                A = params.detach().numpy()
                B = np.zeros((A.shape[0], A.shape[0]))
                plt(figure_name).subplot(nb_params, nb_c_activations, 1 + c * nb_c_activations + idx)

                # B[i, j] is the cosine similarity between rows i and j of A,
                # restricted to the neurons active in this region
                for i in range(A.shape[0]):
                    for j in range(A.shape[0]):
                        if activations[idx, tc + i] != 0 and activations[idx, tc + j] != 0:
                            B[i, j] = np.dot(A[i, :], A[j, :]) / np.linalg.norm(A[i, :]) / np.linalg.norm(A[j, :])
                        else:
                            B[i, j] = None

                if absolute_value:
                    B = np.abs(B)
                if type(threshold) is not bool:
                    B = (B > threshold).astype(int)

                plt(figure_name).imshow(B, vmin=vmin, vmax=vmax)
                plt(figure_name).title(name)
                plt(figure_name).colorbar()

                tc += A.shape[0]
                c += 1
                if tc >= activations.shape[1]:
                    break

def save_classifier_weight(model):
    # export the weights of the final fully connected layer as a numpy array
    w = model.state_dict()['fc.weight']
    w = w.numpy()
    print_info("save weight")
    result_path = output_path('weight.npy')
    np.save(result_path, w)
    print_info("saved !")

def fit(train, test, export=False, training_params=None, export_params=None, **kwargs):
    if not use_gpu():
        print_errors('XGBoost can only be executed on a GPU for the moment', do_exit=True)

    training_params = {} if training_params is None else training_params
    export_params = {} if export_params is None else export_params

    d_test = xgb.DMatrix(np.asarray(test.get_vectors()), label=np.asarray(test.labels))

    # validation_only is assumed to come from the module scope
    # (e.g. special_parameters)
    if not validation_only:
        print_h1('Training: ' + special_parameters.setup_name)

        print_info("get vectors...")
        X = np.asarray(train.get_vectors())
        y = np.asarray(train.labels)
        d_train = xgb.DMatrix(X, label=y)

        gpu_id = first_device().index
        kwargs['verbosity'] = verbose_level()
        kwargs['gpu_id'] = gpu_id
        eval_list = [(d_test, 'eval'), (d_train, 'train')]

        print_info("fit model...")
        bst = xgb.train(kwargs, d_train, num_boost_round=kwargs["num_boost_round"],
                        verbose_eval=kwargs["verbose_eval"], evals=eval_list)
        save_model(bst)
    else:
        bst = load_model()

    print_h1('Validation/Export: ' + special_parameters.setup_name)
    predictions = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)
    res = validate(predictions, np.array(test.labels),
                   training_params['metrics'] if 'metrics' in training_params else tuple(),
                   final=True)
    print_notification(res, end='')

    if export:
        export_results(test, predictions, **export_params)

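# A minimal call sketch for the XGBoost fit above; the booster parameters are
# illustrative, and train_set/test_set stand for any dataset exposing
# get_vectors() and labels as used by fit.
params = {
    'objective': 'multi:softprob',
    'num_class': 1000,          # assumption: number of classes in the task
    'tree_method': 'gpu_hist',  # GPU training, matching the use_gpu() check
    'num_boost_round': 100,
    'verbose_eval': 10,
}
fit(train_set, test_set, export=False, **params)
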
def check_machine():
    """
    Execute some commands to print specific information about the machine.
    """
    # list of commands to be executed
    commands = ('env', 'module list', 'pwd', 'hostname')
    for c in commands:
        # print a constant-width header line for each command
        print(('[' + c + ']' + ' ' + '*' * 80)[:80])
        print_info(os.popen(c).read())

def last_call(self):
    # draw the unit circle
    step = 0.005
    x = np.arange(-1, 1. + step, step)
    y = np.sqrt(np.maximum(1. - x ** 2, np.zeros(x.shape)))
    plt('circle').plot(x, y)
    y = -np.sqrt(np.maximum(1. - x ** 2, np.zeros(x.shape)))
    plt('circle').plot(x, y)

    # scatter the two classes of the dataset
    labels = self.dataset.labels
    dataset = self.dataset.dataset
    plt('circle').scatter(dataset[labels == 0][:, 0], dataset[labels == 0][:, 1])
    plt('circle').scatter(dataset[labels == 1][:, 0], dataset[labels == 1][:, 1])

    # find the largest weight norm, used to normalise all the arrows
    for i, p in enumerate(self.parameters[0]):
        norm = np.sqrt(p[0] ** 2 + p[1] ** 2)
        if norm > self.coef_norm:
            self.coef_norm = norm

    # draw one arrow per neuron, scaled and shifted by its weight and bias
    for i, p in enumerate(self.parameters[0]):
        p /= self.coef_norm
        norm = np.sqrt(p[0] ** 2 + p[1] ** 2)
        new_norm = norm * self.wk[0][i] if self.use_wk else norm
        b = -self.bias[0][i] if self.use_bias else 0.
        b /= norm
        dx, dy = p[0] * new_norm / norm, p[1] * new_norm / norm
        x, y = (0, 0) if not self.use_bias else (p[0] * b / norm, p[1] * b / norm)
        self.arrows.append(
            plt('circle').arrow(x, y, dx, dy, shape='full', head_width=0.04, head_length=0.08))

    fig = get_figure('circle')
    self.axis = fig.gca()

    # animate the arrows over the saved parameter history
    anim = FuncAnimation(fig, self.update, frames=np.arange(0, len(self.parameters)), interval=200)

    path = output_path('circle.gif')
    print_info('Saving GIF at ' + path)
    anim.save(path, dpi=80, writer='imagemagick')
    delete_figure('circle')

def detect_machine():
    hostname = socket.gethostname()
    found_machine = False
    # map the hostname to a known cluster through its prefixes
    for k, v in clusters.items():
        for h in v:
            if hostname.startswith(h):
                special_parameters.machine = k
                found_machine = True
                break
        if found_machine:
            break
    if not found_machine:
        special_parameters.machine = hostname
    print_info('The machine was identified as ' + special_parameters.machine)

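# A sketch of the expected clusters mapping (cluster name -> hostname
# prefixes); the names and prefixes below are invented for illustration.
clusters = {
    'cluster_a': ('node-a', 'login-a'),
    'cluster_b': ('gpu-', 'node-b'),
}
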
def plot_occurrences(train, val, test):
    d_train = np.asarray(train.dataset)
    d_test = np.asarray(test.dataset)
    d_val = np.asarray(val.dataset)

    # only the train occurrences are currently plotted; val and test can be
    # projected and scattered the same way
    geo_tr = project(d_train[:, 0], d_train[:, 1])

    s = 0.8
    plt.style.use('classic')
    fig, ax = plt.subplots()
    ax.scatter(geo_tr[0][:], geo_tr[1][:], color='#93c47d', marker='s', s=s, label="train")

    ax.set_xlim(3200, 4400)
    ax.set_ylim(2000, 3200)
    ax.spines['bottom'].set_color('#dddddd')
    ax.spines['top'].set_color('#dddddd')
    ax.spines['right'].set_color('#dddddd')
    ax.spines['left'].set_color('#dddddd')
    ax.tick_params(axis='x', colors='#dddddd')
    ax.tick_params(axis='y', colors='#dddddd')
    ax.yaxis.label.set_color('#dddddd')
    ax.xaxis.label.set_color('#dddddd')
    ax.title.set_color('#dddddd')

    plt.show()
    print_info('figure saved at: ' + output_path('occurrences.png'))
    fig.savefig(output_path('occurrences.png'), transparent=True)

def extract_7z(source, extension='.7z'):
    # loading a specific source
    r = check_source(source)
    dir_name = r['archive']
    dest_name = r['maps']

    os.chdir(dir_name)  # change directory from working dir to dir with files
    n = len(os.listdir(dir_name))

    for i, item in enumerate(os.listdir(dir_name)):  # loop through items in dir
        print_info('\n------------------------------------------------------------------------------')
        print_info(str(i + 1) + '/' + str(n))
        if item.endswith(extension):  # check for ".zip" or ".7z", etc. extension
            file_name = os.path.abspath(item)  # get full path of files
            print_h2(file_name)
            print_info('\n')
            os.system('7z x ' + file_name + ' -o' + dest_name)

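# os.system breaks on paths containing spaces and hides failures; a sketch of
# a safer variant using subprocess (same 7z flags, return code checked).
import subprocess

def _extract_7z_file(file_name, dest_name):
    # '-o' must be fused with the destination path for 7z
    result = subprocess.run(['7z', 'x', file_name, '-o' + dest_name])
    if result.returncode != 0:
        print_errors('7z failed on ' + file_name, do_exit=False)
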
def get_species_neurons_correlations():
    activations = np.load(output_path('activations.npy'))
    logits = np.load(output_path('logits.npy'))

    print_info("calculate correlation matrix between features and species")

    # standardise both the activations and the logits
    mean_act = np.mean(activations, axis=0)
    std_act = np.std(activations, axis=0)
    norm_act = (activations - mean_act) / std_act
    mean_log = np.mean(logits, axis=0)
    std_log = np.std(logits, axis=0)
    norm_log = (logits - mean_log) / std_log

    # proportion of inactive (zero) activations
    size = activations.shape[0] * activations.shape[1]
    c = size - np.count_nonzero(activations)
    print_info(str(c) + "/" + str(size) + " (" + str(c * 100.0 / size) + "%)")

    # Pearson correlation accumulated sample by sample
    matrix = np.zeros((activations.shape[1], logits.shape[1]), dtype=float)
    for i in progressbar.progressbar(range(activations.shape[0])):
        act = norm_act[i]
        log = norm_log[i]
        for j in range(norm_act.shape[1]):
            matrix[j] += (log * act[j]) / activations.shape[0]

    result_path = output_path('correlation_activations.npy')
    print_info("save activations for species: " + result_path)
    np.save(result_path, matrix)
    print_info("saved !")

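# The double loop above accumulates the Pearson correlation one sample at a
# time; an equivalent vectorised form (a sketch, operating on the same
# standardised arrays) is a single matrix product:
matrix = norm_act.T @ norm_log / activations.shape[0]
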
def print_model_parameters(model):
    # print the shape of every parameter tensor
    for name, param in model.named_parameters():
        print_info(name + ' ' + str(param.shape))
    print_info('\n' + '*' * 50 + '\n')
    # then print their values
    for name, param in model.named_parameters():
        print_info(name + ' *' * 10 + '\n' + str(param.data.detach().numpy()).replace('array', ''))

def export_bigdata(model, test, batch_size, buffer_size, size):
    num_workers = special_parameters.nb_workers
    test_loader = torch.utils.data.DataLoader(test, shuffle=False, batch_size=batch_size,
                                              num_workers=num_workers)
    results = []
    model.eval()

    export_path = output_path('predictions.csv')

    # check if labels have been indexed
    index_path = output_path('index.json')
    indexed_labels = get_index(index_path)

    with open(export_path, 'w') as f:
        print_info('Exporting predictions at ' + export_path)
        f.write('id,class_id,rank,proba\n')  # header

        warnings.simplefilter('ignore')  # warning because old import in progressbar
        bar = progressbar.ProgressBar(max_value=len(test_loader))
        warnings.simplefilter('default')

        for idx, data in enumerate(test_loader):
            # get the inputs
            inputs, labels = data
            outputs = model(inputs)
            results.append(outputs.detach().cpu().numpy())

            # flush the buffer to disk when it is full
            if len(results) >= buffer_size:
                _export_bigdata(f, results, test, indexed_labels, size)
                results = []
            bar.update(idx)

        # flush the remaining predictions
        if len(results) > 0:
            _export_bigdata(f, results, test, indexed_labels, size)
        bar.finish()

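# A usage sketch: batches of 128 predictions, flushed to disk every 100
# batches; test_set stands for any torch Dataset, and 'size' is taken to be
# the number of top classes written per occurrence (an assumption based on
# the csv header above).
export_bigdata(model, test_set, batch_size=128, buffer_size=100, size=30)
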
def extract_patch(source, offset=0, check_file=True):
    """
    Extract IGN patches from IGN maps.

    :param source: the source referring the occurrence file, the maps and the patches path
    :param offset: number of occurrences to skip from the start of the sorted file
    :param check_file: forwarded to extract_patches
    """
    # checking the source
    r = check_source(source)

    # extract manager
    im_manager = IGNImageManager(r['maps'])
    extract_size = 64
    extract_step = 1

    # loading the occurrence file
    df = pd.read_csv(r['occurrences'], header='infer', sep=';', low_memory=False)

    # sorting the dataset to optimise the extraction
    df.sort_values('Latitude', inplace=True)

    # offset management
    df = df.iloc[offset:]

    print_info(str(len(df)) + ' occurrences to extract!')

    im_manager.extract_patches(df[[r['longitude'], r['latitude'], r['id_name']]],
                               r['patches'], size=extract_size, step=extract_step,
                               check_file=check_file)

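# A call sketch; 'full_ign' matches the source name used elsewhere in the
# repository, and the offset value is illustrative (e.g. resuming a run).
extract_patch('full_ign', offset=250000)
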
def wrapper(*args, **kwargs):
    start = time.time()
    print_info('[Executing ' + func.__name__ + ']')

    # check changeable parameters (command line and more)
    if func.__name__ in hp.overriding_parameters():
        # turn positional arguments into keyword arguments so they can be merged
        for arg, name in zip(args, func.__code__.co_varnames):
            kwargs[name] = arg
        args = tuple()
        merge(kwargs, hp.overriding_parameters()[func.__name__])

    # keep track of the configuration that was actually executed
    if len(args) > 0 or len(kwargs) > 0:
        add_config_elements('[' + func.__name__ + ']')
    if len(args) > 0:
        print_info('Args: ' + format_dict_and_tuple(args))
        add_config_elements('Args: ' + format_dict_and_tuple(args))
    if len(kwargs) > 0:
        print_info('Kwargs: ' + format_dict_and_tuple(kwargs))
        add_config_elements('Kwargs: ' + format_dict_and_tuple(kwargs))

    results = func(*args, **kwargs)
    print_durations(time.time() - start)
    return results

def compute_neural_directions(model, X, absolute_value, threshold, min_activations=10):
    # this method only works on fully connected models
    if type(model) is not fully_connected.Net:
        print_errors(str(type(model)) + ' must be of type ' + str(fully_connected.Net) + '.', do_exit=True)

    layers = [m for m in model.modules() if type(m) in (BatchNorm1d, Linear)][:-1]
    final_layers = []
    it = 0
    while it < len(layers):
        # linear layer
        M = layers[it]
        it += 1
        linear_app = M.weight.detach().cpu().numpy()

        # fold a following batch norm layer into the preceding linear map
        if it < len(layers) and type(layers[it]) is BatchNorm1d:
            A = layers[it]
            var = np.diag(A.running_var.cpu().numpy())
            gamma = np.diag(A.weight.detach().cpu().numpy())
            bn = np.matmul(gamma, np.linalg.inv(var))
            linear_app = np.matmul(bn, linear_app)
            it += 1
        final_layers.append(linear_app)

    # get activations
    activations, _ = get_activations(model, X)
    activations, _ = np.unique(activations, return_inverse=True, axis=0)

    # partitions where change is not on the domain are removed
    for i, v in enumerate(np.all(activations == activations[0, :], axis=0)):
        if v:
            activations[:, i] = 0

    # unique after corrections
    activations, _ = np.unique(activations, return_inverse=True, axis=0)

    vmin = 0. if absolute_value else -1.
    vmax = 1.

    vectors = [[] for _ in range(len(final_layers))]
    n_act = min(min_activations, len(activations))
    print_info("n_act: %d" % n_act)

    for i in range(n_act):
        la = None
        # compose the (masked) linear applications layer by layer
        for li, l in enumerate(final_layers):
            activated = activations[i][li * l.shape[0]:(li + 1) * l.shape[0]]
            if la is None:
                la = final_layers[li] * activated[:, np.newaxis]
            else:
                la = np.matmul(final_layers[li], la) * activated[:, np.newaxis]
            for n in la:
                vectors[li].append(n)

    return vectors, vmin, vmax

# with option --more idx=12, the index can be changed from the command line
import numpy as np

from datascience.data.loader import occurrence_loader  # import path assumed
from datascience.data.datasets import EnvironmentalIGNDataset  # import path assumed
from datascience.visu.patch import pplot_patch
from datascience.visu.util import save_fig  # import path assumed
from engine.logging import print_info
from engine.parameters.special_parameters import get_parameters

# load the idx + 1 first elements
idx = get_parameters('idx', 0)
train, _, _ = occurrence_loader(EnvironmentalIGNDataset, source='full_ign', id_name='X_key',
                                label_name='glc19SpId', validation_size=0, test_size=0, limit=idx + 1)
patch, _ = train[idx]

patch = [l.int() for l in patch]
# stack the last three layers as one RGB image
patch = patch[:-3] + [np.transpose(np.stack(patch[-3:], axis=0), (1, 2, 0))]

print_info('Printing patch at ' + str(train.dataset[idx]))
pplot_patch(patch, header=train.named_dimensions)

save_fig()

def get_species_neurons_activations(model, grid_points, batch_size=32):
    activations = predict_grid(model, grid_points, batch_size=batch_size, features_activation=True)
    predictions = predict_grid(model, grid_points, batch_size=batch_size)
    logits = predict_grid(model, grid_points, batch_size=batch_size, logit=True)

    result_path = output_path('activations.npy')
    print_info("save activations: " + result_path)
    np.save(result_path, activations)

    result_path = output_path('predictions.npy')
    print_info("save predictions: " + result_path)
    np.save(result_path, predictions)

    result_path = output_path('logits.npy')
    print_info("save logits: " + result_path)
    np.save(result_path, logits)
    print_info("saved !")

    print_info("save weight")
    w = model.state_dict()['fc.weight']
    w = w.numpy()
    result_path = output_path('weight.npy')
    np.save(result_path, w)
    print_info("saved !")

def predict(model, loader, loss, export=False, filters=tuple(), validation_size=10000, compute_loss=False):
    """
    Give the prediction of the model on a test set.

    :param model: the model
    :param loader: the test set loader
    :param loss: the loss function
    :param compute_loss: compute and report the validation loss
    :param filters: set some outputs to 0
    :param validation_size: maximum number of items to evaluate (-1 for all)
    :param export: if False the predictions are not saved, otherwise the results are
                   exported to a file. If export is True the loader must not be shuffled...
    :return: the arrays of predictions and corresponding labels, and the running loss
    """
    if len(loader) > _memory_overflow_size and (validation_size == -1 or validation_size > _memory_overflow_size):
        print_warning('[predict] The dataset size is {}. Large datasets can cause memory '
                      'overflow during standard prediction...'.format(len(loader)))

    with torch.no_grad():
        total = 0
        model.eval()
        y_preds = []
        y_labels = []
        running_loss = 0.0
        idx = 0

        # keep the raw outputs when the loss has to be computed on them
        if hasattr(model, 'last_sigmoid') and compute_loss:
            model.last_sigmoid = False
        elif hasattr(model, 'last_sigmoid'):
            model.last_sigmoid = True

        for idx, data in enumerate(loader):
            inputs, labels = data
            if use_gpu():
                labels = labels.cuda()

            # wrap them in Variable
            labels_variable = loss.output(labels)
            outputs = model(inputs)

            # if not test set
            if compute_loss and labels[0] != -1:
                loss_value = loss(outputs, labels)
                running_loss += loss_value.item()

            outputs = loss.output(outputs)
            total += labels_variable.size(0)
            y_preds.extend(outputs.data.tolist())
            y_labels.extend(labels_variable.data.tolist())

            # stop early once validation_size items have been evaluated (-1 disables this)
            if total >= validation_size != -1 and not export:
                break

        running_loss /= (idx + 1)  # normalising the loss

        if compute_loss:
            print_info('Validation loss: ' + str(running_loss))
            add_scalar('Loss/Validation', running_loss)

        predictions, labels = np.asarray(y_preds), np.asarray(y_labels)

        # filtering some predicted labels
        for f in filters:
            f(predictions)

        # TODO filtering official labels
        return predictions, labels, running_loss

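# A usage sketch over a validation loader; `loss` stands for the repository's
# loss wrapper exposing output() as used above (an assumption about its API).
predictions, labels, val_loss = predict(model, val_loader, loss,
                                        validation_size=10000, compute_loss=True)
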
def plot_on_map(activations, map_ids, n_cols=1, n_rows=1, figsize=4, log_scale=False,
                mean_size=1, selected=tuple(), legend=None, output="activations", style="grey",
                exp_scale=False, cmap=None, alpha=None, bad_alpha=1., font_size=12,
                color_text='black', color_tick='black'):
    if log_scale:
        print_info("apply log...")
        activations = activations + 1.0
        activations = np.log(activations)
    elif exp_scale:
        print_info("apply exp...")
        p = np.full(activations.shape, 1.2)
        activations = np.power(p, activations)

    print_info("construct array activation map...")
    # recover the (x, y) position of every cell from its "x_y" identifier
    pos = []
    max_x = 0
    max_y = 0
    for id_ in map_ids:
        x, y = id_.split("_")
        x, y = int(x), int(y)
        pos.append((x, y))
        if x > max_x:
            max_x = x
        if y > max_y:
            max_y = y

    size = max(max_x + 1, max_y + 1)
    while size % mean_size != 0:
        size += 1

    nb = n_cols * n_rows
    act_map = np.ndarray((nb, size, size))
    act_map[:] = np.nan

    print_info("select neurons to print...")
    if len(selected) > 0:
        list_select = selected
    else:
        list_select = random.sample(list(range(activations.shape[1])), nb)

    print_info("fill activation map array...")
    for k, act in enumerate(activations):
        for idx, j in enumerate(list_select):
            x, y = pos[k][0], pos[k][1]
            act_map[idx, x, y] = act[j]

    font = {'family': 'normal', 'weight': 'bold', 'size': font_size}
    matplotlib.rc('font', **font)

    if legend is None:
        legend = list(map(str, list_select))

    mplt.rcParams['text.color'] = color_text
    mplt.rcParams['axes.labelcolor'] = color_tick
    mplt.rcParams['xtick.color'] = color_tick
    mplt.rcParams['ytick.color'] = color_tick

    plt(output, figsize=(n_cols * figsize * 1.2, n_rows * figsize))
    fig = get_figure(output)
    fig.subplots_adjust(wspace=0.05)

    print_info("make figure...")
    for j in range(nb):
        # average the map over mean_size x mean_size blocks, ignoring NaNs
        if mean_size != 1:
            height, width = act_map[j].shape
            act_map_j = np.nanmean(np.split(np.nanmean(np.split(act_map[j], width // mean_size, axis=1),
                                                       axis=2), height // mean_size, axis=1), axis=2)
        else:
            act_map_j = act_map[j]

        masked_array = np.ma.array(act_map_j, mask=np.isnan(act_map_j))
        if cmap is None:
            if style == "grey":
                cmap = matplotlib.cm.inferno
                cmap.set_bad('grey', bad_alpha)
            elif style == "white":
                cmap = matplotlib.cm.inferno
                cmap.set_bad('white', bad_alpha)

        ax = plt(output).subplot(n_rows, n_cols, j + 1)
        ax.set_facecolor((0, 0, 0, 0))
        im = plt(output).imshow(masked_array, cmap=cmap, interpolation='none', alpha=alpha)
        plt(output).title(legend[j])
        divider = make_axes_locatable(ax)
        cax = divider.append_axes("right", size="5%", pad=0.05)
        plt(output).colorbar(im, cax=cax)

    fig.tight_layout(pad=0.05)
    fig.patch.set_alpha(0.0)
    save_fig(figure_name=output, extension='png')

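# A usage sketch: a 2 x 3 grid of neuron activation maps, given the "x_y"
# identifier of each grid cell; the neuron indices are illustrative.
plot_on_map(activations, map_ids, n_cols=3, n_rows=2, log_scale=True,
            selected=(3, 17, 42, 64, 128, 200), output='activations_maps')
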
def export_config():
    path = output_path('config.txt')
    print_info('Writing config at: ' + path)
    with open(path, 'a') as f:
        f.write(' '.join(sys.argv) + '\n')

import json

import pandas as pd

from datascience.data.util.source_management import check_source
from engine.logging import print_info

source = check_source('glc20')
raw_occurrences_path = source['raw_source']
occurrences_path = source['occurrences']  # destination

with open(raw_occurrences_path, 'rb') as f:
    d = json.load(f)

# keep only the occurrences whose identification reached the BEST_REF status
data = {'id': [], 'lat': [], 'lon': [], 'species_id': [], 'species_name': []}
for row in d:
    if row['results']['status'] == 'BEST_REF':
        data['id'].append(row['id'])
        data['lat'].append(row['lat'])
        data['lon'].append(row['lon'])
        data['species_id'].append(row['results']['id'])
        data['species_name'].append(row['results']['name'])

df = pd.DataFrame(data=data)
print_info('Saving file')
df.to_csv(occurrences_path, header=True, sep=';', index=False)

def _save_fig(path_name, figure):
    print_info('Saving figure at: ' + path_name)
    figure.savefig(path_name)

def check_extraction(source, save_errors=True, save_filtered=True, id_name='X_key'):
    """
    Check if all patches from an occurrences file have been extracted. Can save the
    list of errors and filter the dataset, keeping only the correctly extracted data.

    :param id_name: the column that contains the patch id that will be used to construct its path
    :param save_filtered: save the dataframe filtered from the errors
    :param save_errors: save the errors found in a file
    :param source: the source referring the occurrence file and the patches path
    """
    # retrieve details of the source
    r = check_source(source)
    if 'occurrences' not in r or 'patches' not in r:
        print_errors('Only sources with occurrences and patches can be checked', do_exit=True)

    df = pd.read_csv(r['occurrences'], header='infer', sep=';', low_memory=False)
    nb_errors = 0
    errors = []

    for idx, row in progressbar.progressbar(enumerate(df.iterrows())):
        patch_id = str(int(row[1][id_name]))

        # constructing the path of a patch given its id
        path = os.path.join(r['patches'], patch_id[-2:], patch_id[-4:-2], patch_id + '.npy')

        # if the path does not correspond to a file, then it's an error
        if not os.path.isfile(path):
            errors.append(row[1][id_name])
            nb_errors += 1

    if nb_errors > 0:
        # summary of the errors
        print_info(str(nb_errors) + ' errors found during the check...')
        if save_errors:
            # filter the dataframe using the errors
            df_errors = df[df[id_name].isin(errors)]
            error_path = output_path('_errors.csv')
            print_info('Saving error file at: ' + error_path)
            # save dataframe to the error file
            df_errors.to_csv(error_path, header=True, index=False, sep=';')
        if save_filtered:
            # filter the dataframe keeping the non-errors
            df_filtered = df[~df[id_name].isin(errors)]
            filtered_path = r['occurrences'] + '.tmp'
            print_info('Saving filtered dataset at: ' + filtered_path)
            df_filtered.to_csv(filtered_path, header=True, index=False, sep=';')
    else:
        print_info('No error has been found!')

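# A usage sketch on the same source name used by extract_patch above; errors
# go to '_errors.csv' and the filtered occurrences to a '.tmp' file.
check_extraction('full_ign', save_errors=True, save_filtered=True)
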