def create_ign_sparse(source_occ, source_ign, patch_size=64, error_path=output_path("error_extract/"), **kwargs):
    r = check_source(source_occ)
    occurrences = r['occurrences']
    r = check_source(source_ign)
    ign_images = r['maps']

    # Lambert-93 projection (EPSG:2154)
    la93 = Proj(init='epsg:2154')

    # extract manager
    im_manager = IGNImageManager(ign_images)
    extract_size = patch_size
    extract_step = 1

    # loading the occurrence file
    df = pd.read_csv(occurrences, header='infer', sep=';', low_memory=False)

    max_lat = df['Latitude'].max()
    print(max_lat)

    # sorting the dataset to optimise the extraction
    df.sort_values('Latitude', inplace=True)

    print_info(str(len(df)) + ' occurrences to extract!')
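# Usage sketch (hypothetical source names; the occurrence source is assumed to
# declare 'occurrences' and the IGN source 'maps', as read above):
#     create_ign_sparse('glc_occurrences', 'ign_maps', patch_size=64)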
def pplot(latitude, longitude, source, resolution=1., style=special_parameters.plt_style, nb_cols=5, alpha=1.):
    """
    Plot the environmental patch extracted at a given position.

    :param latitude: the latitude of the position
    :param longitude: the longitude of the position
    :param source: the source declaring the rasters
    :param resolution: the extraction resolution
    :param style: the matplotlib style to use
    :param nb_cols: the number of columns of the figure
    :param alpha: the transparency (alpha) of the plot
    """
    r = check_source(source)
    rasters = r['rasters']
    extractor = PatchExtractor(rasters, resolution=resolution)
    extractor.add_all()
    extractor.plot(item=(latitude, longitude), return_fig=True, style=style, nb_cols=nb_cols, alpha=alpha)
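# Usage sketch (hypothetical source name): plot the environmental patch around
# Montpellier (43.61 N, 3.88 E) from a source declaring 'rasters'.
#     pplot(43.61, 3.88, source='glc', nb_cols=5, alpha=1.)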
def __init__(self, source, transform=None, input_size=299):
    r = check_source(source)
    self.source = source
    path = r['path']

    if transform is None:
        # default ImageNet-style augmentation and normalization
        self.train_transform = transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        self.test_transform = transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    else:
        self.transform = transform

    dataset = _load_dataset(path)
    random.shuffle(dataset)

    self.country = []
    self.painter = []
    self.type = []
    self.path = []

    # each row is expected to start with (country, painter, type, ...);
    # joining all of its elements gives the relative path of the image
    for row in dataset:
        self.country.append(row[0])
        self.painter.append(row[1])
        self.type.append(row[2])
        self.path.append(os.path.join(*row))
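# Usage sketch (the enclosing class is not shown here; 'PaintingDataset' is a
# hypothetical stand-in for it, and 'paintings' a hypothetical source name):
#     dataset = PaintingDataset('paintings', input_size=299)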
def extract_7z(source, extension='.7z'):
    # loading a specific source
    r = check_source(source)
    dir_name = r['archive']
    dest_name = r['maps']

    os.chdir(dir_name)  # change directory from working dir to dir with files
    n = len(os.listdir(dir_name))

    for i, item in enumerate(os.listdir(dir_name)):  # loop through items in dir
        print_info('\n------------------------------------------------------------------------------')
        print_info(str(i + 1) + '/' + str(n))
        if item.endswith(extension):  # check for ".zip" or ".7z", etc. extension
            file_name = os.path.abspath(item)  # get full path of files
            print_h2(file_name)
            print_info('\n')
            os.system('7z x ' + file_name + ' -o' + dest_name)
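# Usage sketch (hypothetical source name): extract every .7z archive found in
# the source's 'archive' directory into its 'maps' directory. The 7z
# command-line tool must be installed and on the PATH.
#     extract_7z('ign_archives', extension='.7z')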
def __init__(self, source, nb_try_max=1000, islands_sup=0, close_target=False, auto_restart=True):
    """
    :param source: the source declaring 'path' (the root dir of the grib files)
                   and 'polar' (the path to the polar file)
    :param nb_try_max: the number of allowed tries
    """
    super().__init__()
    self.autorestart = auto_restart

    r = check_source(source)
    if 'path' not in r:
        print_errors('The source ' + source + ' does not contain path', do_exit=True)
    if 'polar' not in r:
        print_errors('The source ' + source + ' does not contain polar', do_exit=True)

    self.root_dir = r['path']
    self.game = None
    self.numpy_grib = None
    self.polar = Polar(path_polar_file=r['polar'])
    self.target = None
    self.position = None
    self.start_position = None
    self.grib_list = [file for file in os.listdir(self.root_dir) if file.endswith('.npz')]
    self.start_timestamp = None
    self.timedelta = None
    self.track = None
    self.score = 0
    self.score_ = 0
    self.nb_try = 0
    self.nb_try_max = nb_try_max
    self.dist = None
    self.old_dist = None
    self.dir = None
    self.sog = None
    self.cog = None
    self.twa = None
    self.tws = None
    self.twd = None
    self.close_target = close_target
    self.islands_sup = islands_sup
    self.bins = np.array([i * 45 for i in range(8)])
    self.start()
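# Usage sketch (the enclosing environment class is not shown here; 'SailingEnv'
# is a hypothetical stand-in, and 'wind_gribs' a hypothetical source declaring
# both 'path' and 'polar'):
#     env = SailingEnv('wind_gribs', nb_try_max=1000, close_target=True)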
def raster_characteristics(source):
    """
    Print information about the rasters of a source.

    :param source: the source declaring the rasters
    """
    r = check_source(source)
    rasters = r['rasters']
    extractor = PatchExtractor(rasters)
    extractor.add_all()
    print_statistics(str(extractor))
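# Usage sketch (hypothetical source name):
#     raster_characteristics('glc_rasters')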
def occurrence_loader(dataset_class, source=None, validation_size=0.1, test_size=0.1, splitter=train_test_split,
                      filters=tuple(), online_filters=tuple(), postprocessing=tuple(), save_index='default',
                      limit=None, **kwargs):
    """
    Load an occurrence dataset.

    :param dataset_class: the type of dataset (with rasters or not, etc.)
    :param source: the source name
    :param validation_size: in [0, 1]
    :param test_size: in [0, 1]
    :param splitter: the train/test split function; by default, sklearn's train_test_split
    :param filters: post filters
    :param online_filters: filters that are applied while loading the data
    :param postprocessing: additional transformations
    :param save_index: load, save, default, load_and_save or auto
    :param limit: the maximum number of elements to load
    :param kwargs:
    :return: train, validation and test sets
    """
    if source is not None:
        r = check_source(source)
        merge_smooth(kwargs, r)

    return _occurrence_loader(dataset_class, validation_size=validation_size, test_size=test_size, splitter=splitter,
                              filters=filters, online_filters=online_filters, postprocessing=postprocessing,
                              save_index=save_index, limit=limit, **kwargs)
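# Usage sketch (hypothetical source name): an 80/10/10 train/validation/test
# split of a GeoLifeClef-style occurrence dataset.
#     train, val, test = occurrence_loader(GeoLifeClefDataset, source='glc', validation_size=0.1, test_size=0.1)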
def extract_patch(source, offset=0, check_file=True):
    """
    Extract IGN patches from IGN maps.

    :param source: the source declaring the maps, the occurrence file and the patches destination
    :param offset: the number of occurrences to skip before extracting
    :param check_file: forwarded to IGNImageManager.extract_patches
    """
    # checking the source
    r = check_source(source)

    # extract manager
    im_manager = IGNImageManager(r['maps'])
    extract_size = 64
    extract_step = 1

    # loading the occurrence file
    df = pd.read_csv(r['occurrences'], header='infer', sep=';', low_memory=False)

    # sorting the dataset to optimise the extraction
    df.sort_values('Latitude', inplace=True)

    # offset management
    df = df.iloc[offset:]

    print_info(str(len(df)) + ' occurrences to extract!')

    im_manager.extract_patches(df[[r['longitude'], r['latitude'], r['id_name']]], r['patches'], size=extract_size,
                               step=extract_step, check_file=check_file)
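# Usage sketch (hypothetical source name). Since the dataframe is sorted
# deterministically before the offset is applied, an extraction interrupted
# after, say, 50000 occurrences can be resumed by skipping them:
#     extract_patch('ign_source', offset=50000)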
def load_multitask_bernoulli_dataset(source, test_size=0.1, val_size=0.1, transform=None, splitter=train_test_split):
    r = check_source(source)
    path = r['path']

    classes_index = {}
    labels_index = {}
    labels = []
    classes = []
    images = []

    for c in os.listdir(path):
        classes_index[c] = len(classes_index)  # to index the classes
        path_class = os.path.join(path, c)
        for label in os.listdir(path_class):
            path_label = os.path.join(path_class, label)

            # it is a Bernoulli task, so there must be only two labels, positive and negative,
            # whose folder names are shared among classes
            if label not in labels_index:
                if len(labels_index) >= 2:
                    raise PosNegLabelException('All positive and negative label folders must have the same name...')
                if 'pos' in label.lower():
                    labels_index[label] = 1
                elif 'neg' in label.lower():
                    labels_index[label] = 0
                else:
                    labels_index[label] = len(labels_index)

            for image in os.listdir(path_label):
                labels.append((label, labels_index[label]))  # label name, label ID
                classes.append((c, classes_index[c]))  # class name, class ID
                images.append(os.path.join(path_label, image))  # image path

    dataset = (labels, classes, images)

    # dataset split
    train, test = perform_split(dataset, test_size, splitter)
    train, val = perform_split(train, val_size, splitter)

    if transform is None:
        transform = {
            'train': transforms.Compose([transforms.ToTensor()]),
            'test': transforms.Compose([transforms.ToTensor()])
        }

    train = ImageDatasetMTBernoulli(train, len(classes_index), transform=transform['train'])
    val = ImageDatasetMTBernoulli(val, len(classes_index), transform=transform['test'])
    test = ImageDatasetMTBernoulli(test, len(classes_index), transform=transform['test'])

    print_dataset_statistics(len(train), len(val), len(test), source, len(classes_index))

    return train, val, test, len(classes_index)
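# Usage sketch (hypothetical source name). The source's 'path' directory is
# expected to be laid out as <class>/<pos|neg label folder>/<image>:
#     train, val, test, nb_classes = load_multitask_bernoulli_dataset('mt_source', test_size=0.1, val_size=0.1)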
from datascience.data.datasets.dataset_simple import GeoLifeClefDataset
from datascience.data.loader import occurrence_loader  # occurrence_loader must be imported; module path assumed
from datascience.tools.activations_map.plot_activations_maps import plot_species_on_map
from datascience.data.util.source_management import check_source
from engine.parameters.special_parameters import get_parameters

species = get_parameters('species', 0)
mean_size = get_parameters('mean_size', 1)
figsize = get_parameters('figsize', 5)
log_scale = get_parameters('log_scale', False)
softmax = get_parameters('softmax', False)
alpha = get_parameters('alpha', None)

# loading dataset
_, _, grid_points = occurrence_loader(GeoLifeClefDataset, source='grid_occs_1km', id_name='id', test_size=1,
                                      label_name=None)

sources = check_source('gbif_taxref')

# get activations
plot_species_on_map(grid_points, label_species=sources['label_species'], species=species, mean_size=mean_size,
                    figsize=figsize, log_scale=log_scale, softmax=softmax, alpha=alpha)
from datascience.data.util.source_management import check_source
import json

import pandas as pd

from engine.logging import print_info

source = check_source('glc20')
raw_occurrences_path = source['raw_source']
occurrences_path = source['occurrences']  # destination

with open(raw_occurrences_path, 'rb') as f:
    d = json.load(f)

data = {'id': [], 'lat': [], 'lon': [], 'species_id': [], 'species_name': []}

for row in d:
    if row['results']['status'] == 'BEST_REF':
        data['id'].append(row['id'])
        data['lat'].append(row['lat'])
        data['lon'].append(row['lon'])
        data['species_id'].append(row['results']['id'])
        data['species_name'].append(row['results']['name'])

df = pd.DataFrame(data=data)

print_info('Saving file')
df.to_csv(occurrences_path, header=True, sep=';', index=False)
def check_extraction(source, save_errors=True, save_filtered=True, id_name='X_key'):
    """
    Check that all patches from an occurrence file have been extracted. Can save the list of errors
    and filter the dataset, keeping only the correctly extracted data.

    :param source: the source referring to the occurrence file and the patches path
    :param save_errors: save the errors found in a file
    :param save_filtered: save the dataframe filtered from the errors
    :param id_name: the column that contains the patch id, used to construct its path
    """
    # retrieve details of the source
    r = check_source(source)
    if 'occurrences' not in r or 'patches' not in r:
        print_errors('Only sources with occurrences and patches can be checked', do_exit=True)

    df = pd.read_csv(r['occurrences'], header='infer', sep=';', low_memory=False)

    nb_errors = 0
    errors = []
    for idx, row in progressbar.progressbar(enumerate(df.iterrows())):
        patch_id = str(int(row[1][id_name]))

        # constructing the path of a patch given its id
        path = os.path.join(r['patches'], patch_id[-2:], patch_id[-4:-2], patch_id + '.npy')

        # if the path does not correspond to a file, then it's an error
        if not os.path.isfile(path):
            errors.append(row[1][id_name])
            nb_errors += 1

    if nb_errors > 0:
        # summary of the errors
        print_info(str(nb_errors) + ' errors found during the check...')

        if save_errors:
            # filter the dataframe using the errors
            df_errors = df[df[id_name].isin(errors)]
            error_path = output_path('_errors.csv')
            print_info('Saving error file at: ' + error_path)
            # save dataframe to the error file
            df_errors.to_csv(error_path, header=True, index=False, sep=';')

        if save_filtered:
            # filter the dataframe keeping the non errors
            df_filtered = df[~df[id_name].isin(errors)]
            filtered_path = r['occurrences'] + '.tmp'
            print_info('Saving filtered dataset at: ' + filtered_path)
            df_filtered.to_csv(filtered_path, header=True, index=False, sep=';')
    else:
        print_info('No error has been found!')
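# Usage sketch (hypothetical source name): verify the patches of a source and
# write both the error list and a filtered copy of the occurrence file.
#     check_extraction('glc_patches', save_errors=True, save_filtered=True, id_name='X_key')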