def _get_data(self, file_name, dl_folder, output_path, md5sum):
    """Download input datafile, unzip and store in output_path.

    Parameters
    ----------
    file_name : str
        Name of the file to download.
    dl_folder : str
        Path to the folder where to store the downloaded file.
    output_path : str
        Full path of output file.
    md5sum : str
        Expected MD5 of the downloaded file (after unpacking).

    """
    # Download file and unpack
    fh = dl_file_gitlab(MODEL_ZOO_REPO_URL,
                        MNIST_REPO_PATH + file_name, dl_folder)
    with gzip.open(fh, 'rb') as infile:
        with open(output_path, 'wb') as outfile:
            for line in infile:
                outfile.write(line)

    # Remove the downloaded compressed file
    fm.remove_file(fh)

    # Check the hash of the downloaded file (unpacked)
    if md5(output_path) != md5sum:
        raise RuntimeError('Something wrong happened while '
                           'downloading the dataset. Please try again.')
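# The `md5(path)` helper used above is assumed to compute the file digest
# by reading it in chunks. A minimal sketch of such a helper, for
# illustration only (the name `md5_of_file` is hypothetical; the actual
# secml implementation may differ):
import hashlib

def md5_of_file(path, chunk_size=1024):
    """Return the hex MD5 digest of the file at `path`, read in chunks."""
    h = hashlib.md5()
    with open(path, 'rb') as f:
        # Read fixed-size chunks until the sentinel b'' (end of file)
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()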
def tearDown(self):
    # Remove test file(s) if they exist. Iterate over the files so that
    # a missing first file does not prevent removal of the second one
    for test_file in (self.test_file, self.test_file_2):
        try:
            fm.remove_file(test_file)
        except (OSError, IOError) as e:
            if e.errno != 2:  # errno 2: 'No such file or directory'
                raise
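# The try/remove/ignore-errno-2 pattern above recurs in several tests
# throughout this section. A minimal sketch of a helper that could
# consolidate it (the name `remove_file_if_exists` is hypothetical and
# not part of the codebase; `fm` is the secml file manager used above):
import errno

def remove_file_if_exists(path):
    """Remove `path`, silently ignoring the error if it does not exist."""
    try:
        fm.remove_file(path)
    except (OSError, IOError) as e:
        if e.errno != errno.ENOENT:  # ENOENT: 'No such file or directory'
            raise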
def tearDown(self):
    # Remove existing 'models_dict.json' before testing
    if fm.file_exist(MODELS_DICT_PATH):
        fm.remove_file(MODELS_DICT_PATH)
    # Remove the folder with the test model (force, because not empty)
    if fm.folder_exist(fm.join(SECML_MODELS_DIR, '_test')):
        fm.remove_folder(fm.join(SECML_MODELS_DIR, '_test'), force=True)
def _get_data(self, file_url, dl_folder):
    """Download input datafile, unzip and store in dl_folder.

    Parameters
    ----------
    file_url : str
        URL of the file to download.
    dl_folder : str
        Path to the folder where to store the downloaded file.

    """
    f_dl = fm.join(dl_folder, 'iCubWorld28_128x128.zip?dl=1')
    if not fm.file_exist(f_dl) or md5(f_dl) != ICUBWORLD28_MD5:
        # Generate the full path to the downloaded file
        f_dl = dl_file(file_url, dl_folder, md5_digest=ICUBWORLD28_MD5)

    self.logger.info("Extracting files...")

    # Extract the content of the downloaded file
    with zipfile.ZipFile(f_dl, 'r') as zf:
        zf.extractall(dl_folder)

    # Remove downloaded file
    fm.remove_file(f_dl)

    # The iCubWorld28 zip file contains a macOS private folder, clean it up
    if fm.folder_exist(fm.join(ICUBWORLD28_PATH, '__MACOSX')):
        fm.remove_folder(fm.join(ICUBWORLD28_PATH, '__MACOSX'), force=True)

    # The zip file also contains macOS private files, clean them up
    for dirpath, dirnames, filenames in os.walk(ICUBWORLD28_PATH):
        for file in filenames:
            if fnmatch(file, '.DS_Store'):
                fm.remove_file(fm.join(dirpath, file))

    # Now move all data to an upper folder if needed
    if not fm.folder_exist(self._train_path) \
            or not fm.folder_exist(self._test_path):
        sub_d = fm.join(dl_folder, fm.listdir(dl_folder)[0])
        for e in fm.listdir(sub_d):
            e_full = fm.join(sub_d, e)  # Full path to current element
            try:  # Call copy_file or copy_folder when applicable
                if fm.file_exist(e_full):
                    fm.copy_file(e_full, dl_folder)
                elif fm.folder_exist(e_full):
                    fm.copy_folder(e_full, fm.join(dl_folder, e))
            except (OSError, IOError):
                pass  # Skip elements that cannot be copied

        # Check that the main dataset file is now in the correct folder
        if not fm.folder_exist(self._train_path) \
                or not fm.folder_exist(self._test_path):
            raise RuntimeError("dataset main file not available!")

        # The subdirectory can now be removed
        fm.remove_folder(sub_d, force=True)
def test_save_and_load_svmlight_file(self):
    """Testing libsvm dataset loading and saving."""
    self.logger.info("Testing libsvm dataset loading and saving...")

    test_file = fm.join(fm.abspath(__file__), "myfile.libsvm")

    # Cleaning test file
    try:
        fm.remove_file(test_file)
    except (OSError, IOError) as e:
        if e.errno != 2:  # errno 2: 'No such file or directory'
            raise

    self.logger.info("Patterns saved:\n{:}".format(self.patterns))
    self.logger.info("Labels saved:\n{:}".format(self.labels))

    CDataLoaderSvmLight.dump(
        CDataset(self.patterns, self.labels), test_file)

    new_dataset = CDataLoaderSvmLight().load(test_file)

    self.assertFalse((new_dataset.X != self.patterns).any())
    self.assertFalse((new_dataset.Y != self.labels).any())

    # Load data again, now removing all-zero features (columns)
    new_dataset = CDataLoaderSvmLight().load(
        test_file, remove_all_zero=True)

    self.logger.info("Patterns loaded:\n{:}".format(new_dataset.X))
    self.logger.info("Labels loaded:\n{:}".format(new_dataset.Y))
    self.logger.info("Mapping back:\n{:}".format(
        new_dataset.header.idx_mapping))

    self.assertTrue(new_dataset.X.issparse)
    self.assertTrue(new_dataset.Y.isdense)
    self.assertTrue(new_dataset.header.idx_mapping.isdense)

    # Non-zero elements should be unchanged
    self.assertEqual(self.patterns.nnz, new_dataset.X.nnz)
    new_nnz_data = new_dataset.X.nnz_data
    self.assertFalse(
        (self.patterns.nnz_data != new_nnz_data.sort()).any())

    # With idx_mapping we should be able to reconstruct original data
    original = CArray.zeros(self.patterns.shape, sparse=True)
    original[:, new_dataset.header.idx_mapping] = new_dataset.X
    self.assertFalse((self.patterns != original).any())

    # Cleaning test file
    try:
        fm.remove_file(test_file)
    except (OSError, IOError) as e:
        if e.errno != 2:  # errno 2: 'No such file or directory'
            raise
def _test_load_model(self, defs_url, model_url, state_url):
    """Test for `load_model` valid behavior.

    We test the following:
     - all valid requests
     - a need for updating the models dict and re-downloading the model
     - a need for updating the models dict and re-downloading the model,
       with a connection error when downloading the models dict

    Parameters
    ----------
    defs_url : str or None, optional
    model_url : str or None, optional
    state_url : str or None, optional

    """
    with requests_mock.Mocker() as m:
        # Simulate a fine process, with all resources available
        self._mock_requests(m, defs_url=defs_url,
                            model_url=model_url, state_url=state_url)

        self._check_test_model()  # Call model loading

        # We now simulate a need for `models_dict.json` update
        # by removing the `.last_update` file
        fm.remove_file(fm.join(SECML_MODELS_DIR, '.last_update'))
        # Also remove the test model to force re-download
        fm.remove_folder(fm.join(SECML_MODELS_DIR, '_test'), force=True)

        self._check_test_model()  # Call model loading

    # We now simulate a need for `models_dict.json` update,
    # but a connection error occurs (simulated by not mocking the dl url).
    # The last available version of the models dict should be used
    fm.remove_file(fm.join(SECML_MODELS_DIR, '.last_update'))
    fm.remove_folder(fm.join(SECML_MODELS_DIR, '_test'), force=True)

    with requests_mock.Mocker() as m:
        # Do not mock the url for models definitions
        self._mock_requests(m, defs_url=None,
                            model_url=model_url, state_url=state_url)

        self._check_test_model()  # Call model loading
def test_save_load(self):
    """Test save/load of sparse arrays."""
    self.logger.info("UNITTEST - CSparse - save/load")

    test_file = fm.join(fm.abspath(__file__), 'test.txt')

    # Cleaning test file
    try:
        fm.remove_file(test_file)
    except (OSError, IOError) as e:
        if e.errno != 2:  # errno 2: 'No such file or directory'
            raise

    self.logger.info(
        "UNITTEST - CSparse - Testing save/load for sparse matrix")

    self.sparse_matrix.save(test_file)

    self.logger.info(
        "Saving again with overwrite=False... IOError should be raised.")
    with self.assertRaises(IOError) as e:
        self.sparse_matrix.save(test_file)
    self.logger.info(e.exception)

    loaded_sparse_matrix = CSparse.load(test_file, dtype=int)

    self.assertFalse((loaded_sparse_matrix != self.sparse_matrix).any(),
                     "Saved and loaded arrays (matrices) are not equal!")

    self.logger.info(
        "UNITTEST - CSparse - Testing save/load for sparse vector")

    self.sparse_vector.save(test_file, overwrite=True)

    loaded_sparse_vector = CSparse.load(test_file, dtype=int)

    self.assertFalse((loaded_sparse_vector != self.sparse_vector).any(),
                     "Saved and loaded arrays (vectors) are not equal!")

    # Cleaning test file
    try:
        fm.remove_file(test_file)
    except (OSError, IOError) as e:
        if e.errno != 2:  # errno 2: 'No such file or directory'
            raise
def _get_data(self, file_url, dl_folder, output_path):
    """Download input datafile, unzip and store in output_path.

    Parameters
    ----------
    file_url : str
        URL of the file to download.
    dl_folder : str
        Path to the folder where to store the downloaded file.
    output_path : str
        Full path of output file.

    """
    # Download file and unpack
    fh = dl_file(file_url, dl_folder)
    with gzip.open(fh, 'rb') as infile:
        with open(output_path, 'wb') as outfile:
            for line in infile:
                outfile.write(line)

    # Remove the downloaded compressed file
    fm.remove_file(fh)
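# A minimal alternative sketch for the unpack step above, assuming the
# archive is a plain gzip stream: `shutil.copyfileobj` copies in fixed-size
# binary chunks instead of iterating over "lines", which is more idiomatic
# for binary data (the helper name `_gunzip_to` is hypothetical):
import gzip
import shutil

def _gunzip_to(gz_path, output_path):
    """Decompress the gzip file at `gz_path` into `output_path`."""
    with gzip.open(gz_path, 'rb') as infile, \
            open(output_path, 'wb') as outfile:
        shutil.copyfileobj(infile, outfile)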
def _test_save_load_model(self, clf, clf_new, ts):
    """Test for `.save_model` and `.load_model` methods.

    Parameters
    ----------
    clf : CClassifierPyTorch
    clf_new : CClassifierPyTorch
        Another instance of the same classifier.
    ts : CDataset

    """
    self.assertTrue(clf.is_fitted())

    pred_y = clf.predict(ts.X)
    self.logger.info(
        "Predictions of the original clf:\n{:}".format(pred_y))

    state_path = fm.join(tempfile.gettempdir(), "state.tar")

    clf.save_model(state_path)

    clf_new.load_model(state_path)

    self.logger.info("Testing restored model")

    # Test if predict works even without loss and optimizer
    del clf_new._loss
    del clf_new._optimizer
    del clf_new._optimizer_scheduler

    pred_y_post = clf_new.predict(ts.X)
    self.logger.info(
        "Predictions of the restored model:\n{:}".format(pred_y_post))

    self.assert_array_equal(pred_y, pred_y_post)

    fm.remove_file(state_path)
def _get_models_dict():
    """Downloads the dictionary of models definitions.

    The file will be re-downloaded every 30 minutes (upon request)
    to update the models definitions from the repository.

    Returns
    -------
    models_dict : dict
        Dictionary with models definitions. Each key is an available model.
        Each model entry is defined by:
         - "model", path to the script with model definition
         - "state", path to the archive containing the pre-saved model state
         - "model_md5", md5 checksum of model definition
         - "state_md5", md5 checksum of pre-saved model state

    """
    # The `.last_update` file contains the last time MODELS_DICT_FILE
    # has been downloaded. Read the last update time if this file is
    # available, otherwise the file will be created later
    last_update_path = fm.join(SECML_MODELS_DIR, '.last_update')
    last_update_format = "%d %m %Y %H:%M"  # Specific format to avoid locale
    current_datetime = datetime.utcnow()  # UTC datetime to avoid locale

    update_models_dict = None  # Trigger flag for model definitions update
    if fm.file_exist(MODELS_DICT_PATH):
        update_models_dict = True  # By default, trigger update
        if fm.file_exist(last_update_path):
            try:
                with open(last_update_path) as fp:
                    last_update = \
                        datetime.strptime(fp.read(), last_update_format)
                # Compute the threshold for triggering an update
                last_update_th = last_update + timedelta(minutes=30)
            except ValueError as e:
                # Error occurred while parsing the last update date from
                # file. Clean it and re-create later.
                # Definitions update stays True
                _logger.debug(e)  # Log the error for debug purposes
                _logger.debug("Removing `{:}`".format(last_update_path))
                fm.remove_file(last_update_path)
            else:
                # Do not trigger update if last update threshold not passed
                if current_datetime < last_update_th:
                    update_models_dict = False

    if update_models_dict is not False:
        # If update_models_dict is None, the models dict is not available;
        # if it is True, an update has been triggered.
        # In either case, we need to download the data and extract it
        try:  # Catch download errors
            # Download definitions from the current version's branch first,
            # then from the master branch
            _dl_data_versioned(MODELS_DICT_FILE, SECML_MODELS_DIR)
        except Exception as e:
            if update_models_dict is None:
                # The models dict is not available at all,
                # so we propagate the error. Otherwise pass
                raise e
            _logger.debug(e)  # Log the error for debug purposes
            _logger.debug("Error when updating the models definitions. "
                          "Using the last available ones...")
        else:  # No error raised during the download process
            # Check if the file has been correctly downloaded
            if not fm.file_exist(MODELS_DICT_PATH):
                raise RuntimeError(
                    'Something wrong happened while downloading the '
                    'models definitions. Please try again.')
            # Update or create the "last update" file
            with open(last_update_path, "w") as fp:
                fp.write(current_datetime.strftime(last_update_format))

    with open(MODELS_DICT_PATH) as fp:
        return json.loads(fp.read())
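# For illustration only: a parsed `models_dict.json` with one entry would
# look like the dict below, per the docstring above. The model name, paths
# and digest placeholders are made-up examples, not actual zoo entries:
_example_models_dict = {
    "mnist-cnn": {
        "model": "mnist/mnist-cnn.py",  # Script with the model definition
        "state": "mnist/mnist-cnn.gz",  # Archive with the pre-saved state
        "model_md5": "<md5 checksum of the definition script>",
        "state_md5": "<md5 checksum of the pre-saved state>",
    }
}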
def test_save_load(self):

    self.logger.info("UNITTEST - CDense - save/load matrix")

    test_file = fm.join(fm.abspath(__file__), 'test.txt')

    # Cleaning test file
    try:
        fm.remove_file(test_file)
    except (OSError, IOError) as e:
        if e.errno != 2:  # errno 2: 'No such file or directory'
            raise

    # Compare NumPy versions numerically: plain string comparison is
    # unreliable (e.g. '1.9' > '1.18' lexicographically)
    np_version = tuple(int(v) for v in np.__version__.split('.')[:2])

    a = CDense().zeros((1000, 1000))

    with self.timer():
        a.save(test_file)

    with self.timer():
        b = CDense().load(
            test_file, startrow=100, cols=CDense(np.arange(0, 100)))

    self.assertFalse((a[100:, 0:100] != b).any())

    self.logger.info("UNITTEST - CDense - save/load vector")

    a = CDense().zeros(1000, dtype=int)

    with self.timer():
        a.save(test_file, overwrite=True)

    with self.timer():
        b = CDense().load(
            test_file, cols=list(range(100, 1000)), dtype=int).ravel()

    self.assertFalse((a[0, 100:] != b).any())

    if np_version < (1, 18):
        with self.assertRaises(IndexError) as e:
            CDense().load(test_file, startrow=10)
        self.logger.info("Expected error: {:}".format(e.exception))
    else:
        with self.logger.catch_warnings():
            self.logger.filterwarnings(
                "ignore", message="genfromtxt: Empty input file")
            a = CDense().load(test_file, startrow=10)
            self.assertEqual(a.size, 0)

    self.logger.info("UNITTEST - CDense - save/load row vector")

    a = CDense().zeros((1, 1000))

    with self.timer():
        a.save(test_file, overwrite=True)

    with self.timer():
        b = CDense().load(test_file, cols=CDense.arange(100, 1000))

    self.assertFalse((a[:, 100:] != b).any())

    # For some reason np.genfromtxt does not close the file here.
    # Let's handle the resource warning about the unclosed file
    with self.logger.catch_warnings():
        self.logger.filterwarnings("ignore", message="unclosed file")

        if np_version < (1, 18):
            with self.assertRaises(IndexError) as e:
                CDense().load(test_file, startrow=10)
            self.logger.info("Expected error: {:}".format(e.exception))
        else:
            self.logger.filterwarnings(
                "ignore", message="genfromtxt: Empty input file")
            a = CDense().load(test_file, startrow=10)
            self.assertEqual(a.size, 0)

    self.logger.info("UNITTEST - CDense - save/load negative vector")

    a = -CDense().zeros(1000)

    a.save(test_file, overwrite=True)

    with open(test_file, mode='at+') as fhandle:
        with self.timer():
            a.save(fhandle, overwrite=True)

    b = CDense().load(test_file)

    # Simulating double save w/ append
    a = a.atleast_2d().append(a.atleast_2d(), axis=0)

    self.assertFalse((a != b).any())

    a = CDense(['a', 'b'])

    with self.timer():
        a.save(test_file, overwrite=True)

    b = CDense().load(test_file, dtype=str).ravel()

    self.assertFalse((a != b).any())

    # Cleaning test file
    try:
        fm.remove_file(test_file)
    except (OSError, IOError) as e:
        if e.errno != 2:  # errno 2: 'No such file or directory'
            raise
def dl_file(url, output_dir, user=None, headers=None,
            chunk_size=1024, md5_digest=None):
    """Download file from input url and store in output_dir.

    Parameters
    ----------
    url : str
        Url of the file to download.
    output_dir : str
        Path to the directory where the file should be stored.
        If the folder does not exist, it will be created.
    user : str or None, optional
        String with the user[:password] if required for accessing url.
    headers : dict or None, optional
        Dictionary with any additional header for the download request.
    chunk_size : int, optional
        Size of the data chunk to read from url in bytes. Default 1024.
    md5_digest : str or None, optional
        Expected MD5 digest of the downloaded file.
        If a different digest is computed, the downloaded file will be
        removed and ValueError is raised.

    Returns
    -------
    out_path : str
        Full path of the downloaded file.

    """
    # Validate inputs before starting the request
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1 byte")

    # Parsing user string
    auth = tuple(user.split(':')) if user is not None else None
    # If no password is specified, use an empty string
    auth = (auth[0], '') if auth is not None and len(auth) == 1 else auth

    r = requests.get(url, auth=auth, headers=headers, stream=True)

    if r.status_code != 200:
        raise RuntimeError("File is not available (error code {:})".format(
            r.status_code))

    # Get file size (bytes)
    if "content-length" in r.headers:
        total_size = r.headers.get('content-length').strip()
        total_size = int(total_size)
    else:  # Total size unknown
        total_size = None

    dl = 0

    sys.stdout.write("Downloading from `{:}`".format(url))
    if total_size is not None:
        sys.stdout.write(" ({:} bytes)".format(total_size))
    sys.stdout.write("\n")
    sys.stdout.flush()

    # Create output directory if it does not exist
    if not fm.folder_exist(output_dir):
        fm.make_folder(output_dir)

    try:  # Get the filename from the response headers
        fname = re.findall(
            r"filename=\"(.+)\"", r.headers["Content-Disposition"])[0]
    except (KeyError, IndexError):
        # Or use the last part of the download url (removing parameters)
        fname = url.split('/')[-1].split('?', 1)[0]

    # Build full path of output file
    out_path = fm.join(output_dir, fname)

    # Read data and store each chunk
    with open(out_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:  # Filter out keep-alive new chunks
                f.write(chunk)
                # Report progress (if total_size is known)
                if total_size is not None:
                    dl += len(chunk)
                    done = int((50 * dl) / total_size)
                    if sys.stdout.isatty():
                        # Provide real-time updates (if stdout is a tty)
                        sys.stdout.write("\r[{:}{:}] {:}/{:}".format(
                            '=' * done, ' ' * (50 - done), dl, total_size))
                        sys.stdout.flush()

    sys.stdout.write("\nFile stored in `{:}`\n".format(out_path))
    sys.stdout.flush()

    if md5_digest is not None and md5_digest != md5(out_path, chunk_size):
        fm.remove_file(out_path)  # Remove the probably-corrupted file
        raise ValueError("Unexpected MD5 hash for the downloaded file.")

    return out_path
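# A minimal usage sketch for `dl_file`. The url, folder and digest below
# are hypothetical, for illustration only:
#
#   out_path = dl_file('https://example.com/data/archive.gz',
#                      '/tmp/downloads',
#                      md5_digest='0123456789abcdef0123456789abcdef')
#   # out_path -> '/tmp/downloads/archive.gz'
#
# If the computed digest of the stored file differs from `md5_digest`,
# the file is removed and ValueError is raised, so a truncated or
# corrupted download never remains on disk.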
def setUp(self):
    # Remove existing 'models_dict.json' before testing
    if fm.file_exist(MODELS_DICT_PATH):
        fm.remove_file(MODELS_DICT_PATH)