def test_compress_helper(self):
    """
    Sends this test file through ``zip_files``/``unzip_files`` and then
    through ``gzip_files``/``ungzip_files``, checking after each round
    trip the type of the archive and the single entry which comes back.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    binary = bytes
    source = os.path.abspath(__file__).replace(".pyc", ".py")

    def check_entries(entries):
        # a list holding one (name, content) pair is expected,
        # the name must point to this very file
        self.assertTrue(isinstance(entries, list))
        self.assertEqual(len(entries), 1)
        if not isinstance(entries[0][1], (binary, str)):
            raise TypeError(type(entries[0][1]))
        self.assertTrue(entries[0][0].endswith(
            "_unittests/ut_filehelper/test_compress_helper.py"))

    # zip: the archive is accepted as bytes or str
    zipped = zip_files(None, [source], fLOG=fLOG)
    fLOG(len(zipped), type(zipped))
    if not isinstance(zipped, (binary, str)):
        raise TypeError(type(zipped))
    check_entries(unzip_files(zipped))

    # binary: the gzip archive must be bytes
    gzipped = gzip_files(None, [source], fLOG=fLOG)
    fLOG(len(gzipped), type(gzipped))
    if not isinstance(gzipped, binary):
        raise TypeError(type(gzipped))
    check_entries(ungzip_files(gzipped))
def download_dump(country, name, folder=".", unzip=True, timeout=-1,
                  overwrite=False, fLOG=noLOG):
    """
    Downloads *wikipedia dumps* from
    `dumps.wikimedia.org/frwiki/latest/ <https://dumps.wikimedia.org/frwiki/latest/>`_.

    The download is skipped when the archive or the file obtained by
    removing its extension is already in *folder*, unless *overwrite*
    is set.

    @param      country     country
    @param      name        name of the stream to download
    @param      folder      where to download
    @param      unzip       unzip the file
    @param      timeout     timeout
    @param      overwrite   overwrite
    @param      fLOG        logging function
    @return                 what ``ungzip_files`` returns (its only element
                            if it is a list) when the archive is uncompressed
                            by this call, otherwise the local archive name
                            without its ``.gz`` suffix (this also applies
                            when *unzip* is False)
    """
    url = "https://dumps.wikimedia.org/{0}wiki/latest/{0}wiki-{1}".format(
        country, name)
    archive = os.path.join(folder, url.rsplit("/", 1)[-1])
    extracted = os.path.splitext(archive)[0]

    if overwrite or not (os.path.exists(archive) or os.path.exists(extracted)):
        get_url_content_timeout(url, timeout=timeout,
                                encoding=None, output=archive,
                                chunk=2**20, fLOG=fLOG)

    if not unzip or os.path.exists(extracted):
        # nothing to uncompress in this call
        if archive.endswith('.gz'):
            return archive[:-3]
        return archive

    names = ungzip_files(archive, unzip=False, where_to=folder)
    # the compressed file is not kept once uncompressed
    os.remove(archive)
    if not isinstance(names, list):
        return names
    if len(names) != 1:
        raise DataException(  # pragma: no cover
            "Expecting only one file, not '{0}'".format(names))
    return names[0]
def test_compress_helper_text(self):
    """
    Compresses this test file with ``gzip_files`` using *utf-8* and checks
    that the name of this test appears in what ``ungzip_files`` gives
    back with the same encoding.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    source = os.path.abspath(__file__).replace(".pyc", ".py")
    compressed = gzip_files(None, [source], fLOG=fLOG, encoding="utf-8")
    fLOG(len(compressed), type(compressed))

    # the compressed content must be binary even if an encoding was given
    binary = bytes
    if not isinstance(compressed, binary):
        raise TypeError(type(compressed))

    content = ungzip_files(compressed, encoding="utf-8")
    self.assertTrue("test_compress_helper_text" in content)
def download_pageviews(dt, folder=".", unzip=True, timeout=-1,
                       overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia pageviews for a precise date (up to the hour),
    the url follows the pattern::

        https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz

    The download is skipped when the archive or its uncompressed version
    is already in *folder*, unless *overwrite* is set.

    @param      dt          datetime
    @param      folder      where to download
    @param      unzip       unzip the file
    @param      timeout     timeout
    @param      overwrite   overwrite
    @param      fLOG        logging function
    @return                 filename, the uncompressed file when *unzip*
                            is True and the archive is not on disk anymore,
                            the archive otherwise

    More information on page
    `pageviews <https://dumps.wikimedia.org/other/pageviews/>`_.
    """
    url = "https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz"
    url = dt.strftime(url)
    file = url.split("/")[-1]
    name = os.path.join(folder, file)
    unzipname = os.path.splitext(name)[0]
    if overwrite or (not os.path.exists(name) and not os.path.exists(unzipname)):
        get_url_content_timeout(url, timeout=timeout,
                                encoding=None, output=name,
                                chunk=2**20, fLOG=fLOG)
    if unzip and not os.path.exists(unzipname):
        names = ungzip_files(name, unzip=False, where_to=folder)
        # the archive is not kept once uncompressed
        os.remove(name)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names
    if unzip and not os.path.exists(name):
        # The uncompressed file is already there (otherwise the branch above
        # would have run) and the archive was removed by an earlier call:
        # returning the archive name would hand back a path which does not
        # exist, so the uncompressed file is returned instead.
        return unzipname
    return name
def unzip_files(self, group):
    """
    Uncompresses the archives (*.zip*, *.7z*, *.rar*, *.gz*) found among the
    files of a group whose path contains ``attachments``.

    Every archive is extracted in a folder named after it (extension
    replaced by ``_zip``, ``_7z``, ``_rar`` or ``_gz``, spaces and commas
    replaced by ``_``). If this folder already exists, the archive is
    considered as already processed and is skipped.

    @param      group       group name
    @return                 list of new files (the concatenation of what the
                            uncompressing functions return)
    """
    def fvalid(zip_name, local_name):
        # compiled python files are not extracted
        return not ("__pycache__" in zip_name or zip_name.endswith(".pyc"))

    def destination(archive, suffix):
        # folder which receives the content of *archive*
        stem = os.path.splitext(archive)[0]
        return (stem + suffix).replace(" ", "_").replace(",", "_")

    # the enumeration is stored before any folder is created
    candidates = list(self.enumerate_group_files(group))
    files = []
    for name in candidates:
        if "attachments" not in name:
            continue
        ext = os.path.splitext(name)[-1]

        if ext == ".zip":
            folder = destination(name, "_zip")
            if os.path.exists(folder):
                # already done, we do not do it again
                continue
            self.fLOG("[ProjectsRepository.unzip_files] unzip '{0}'".format(name))
            self.fLOG("[ProjectsRepository.unzip_files] creating '{0}'".format(folder))
            os.mkdir(folder)
            try:
                extracted = unzip_files(name, folder, fLOG=self.fLOG,
                                        fvalid=fvalid, fail_if_error=False)
            except zipfile.BadZipFile as e:
                # a corrupted archive is logged and contributes no file
                self.fLOG(
                    "[ProjectsRepository.unzip_files] ERROR: unable to unzip '{0}' because of '{1}']".format(name, e))
                extracted = []
            files.extend(extracted)
        elif ext == ".7z":
            folder = destination(name, "_7z")
            if os.path.exists(folder):
                # already done, we do not do it again
                continue
            self.fLOG("[ProjectsRepository.un7zip_files] un7zip '{0}'".format(name))
            self.fLOG("[ProjectsRepository.un7zip_files] creating '{0}'".format(folder))
            os.mkdir(folder)
            files.extend(
                un7zip_files(name, folder, fLOG=self.fLOG, fvalid=fvalid))
        elif ext == ".rar":
            folder = destination(name, "_rar")
            if os.path.exists(folder):
                # already done, we do not do it again
                continue
            self.fLOG("[ProjectsRepository.unrar_files] unrar '{0}'".format(name))
            self.fLOG("[ProjectsRepository.unrar_files] creating '{0}'".format(folder))
            os.mkdir(folder)
            files.extend(
                unrar_files(name, folder, fLOG=self.fLOG, fvalid=fvalid))
        elif ext == ".gz":
            folder = destination(name, "_gz")
            if os.path.exists(folder):
                # already done, we do not do it again
                continue
            self.fLOG("[ProjectsRepository.ungzip_files] ungzip '{0}'".format(name))
            self.fLOG("[ProjectsRepository.ungzip_files] creating '{0}'".format(folder))
            os.mkdir(folder)
            # the unzip option is switched off for names containing pkl.gz
            files.extend(
                ungzip_files(name, folder, fLOG=self.fLOG, fvalid=fvalid,
                             unzip="pkl.gz" not in name))
        elif ext == ".tar.gz":
            # NOTE(review): os.path.splitext only returns the last suffix,
            # ext is never ".tar.gz" and such files go through the ".gz"
            # branch above; confirm which behaviour is wanted.
            raise Exception("unable to process such a file: " + name)
    return files
def unzip_files(self, group):
    """
    Uncompresses the archives (*.zip*, *.7z*, *.gz*) found among the
    files of a group whose path contains ``attachments``.

    Every archive is extracted in a folder named after it (extension
    replaced by ``_zip``, ``_7z`` or ``_gz``, spaces and commas replaced
    by ``_``). If this folder already exists, the archive is considered
    as already processed and is skipped.

    @param      group       group name
    @return                 list of new files (the concatenation of what the
                            uncompressing functions return)
    """
    def fvalid(zip_name, local_name):
        # compiled python files are not extracted
        return not ("__pycache__" in zip_name or zip_name.endswith(".pyc"))

    def destination(archive, suffix):
        # folder which receives the content of *archive*
        stem = os.path.splitext(archive)[0]
        return (stem + suffix).replace(" ", "_").replace(",", "_")

    # the enumeration is stored before any folder is created
    candidates = list(self.enumerate_group_files(group))
    files = []
    for name in candidates:
        if "attachments" not in name:
            continue
        ext = os.path.splitext(name)[-1]

        if ext == ".zip":
            folder = destination(name, "_zip")
            if os.path.exists(folder):
                # already done, we do not do it again
                continue
            self.fLOG("ProjectsRepository.unzip_files [unzip {0}]".format(name))
            self.fLOG("ProjectsRepository.unzip_files [creating {0}]".format(folder))
            os.mkdir(folder)
            files.extend(
                unzip_files(name, folder, fLOG=self.fLOG, fvalid=fvalid))
        elif ext == ".7z":
            folder = destination(name, "_7z")
            if os.path.exists(folder):
                # already done, we do not do it again
                continue
            self.fLOG("ProjectsRepository.un7zip_files [un7zip {0}]".format(name))
            self.fLOG("ProjectsRepository.un7zip_files [creating {0}]".format(folder))
            os.mkdir(folder)
            files.extend(
                un7zip_files(name, folder, fLOG=self.fLOG, fvalid=fvalid))
        elif ext == ".gz":
            folder = destination(name, "_gz")
            if os.path.exists(folder):
                # already done, we do not do it again
                continue
            self.fLOG("ProjectsRepository.ungzip_files [ungzip {0}]".format(name))
            self.fLOG("ProjectsRepository.ungzip_files [creating {0}]".format(folder))
            os.mkdir(folder)
            files.extend(
                ungzip_files(name, folder, fLOG=self.fLOG, fvalid=fvalid))
        elif ext == ".tar.gz":
            # NOTE(review): os.path.splitext only returns the last suffix,
            # ext is never ".tar.gz" and such files go through the ".gz"
            # branch above; confirm which behaviour is wanted.
            raise Exception("unable to process such a file: " + name)
    return files
def unzip_files(self, group):
    """
    Goes through the files of a group whose path contains ``attachments``
    and uncompresses the archives (*.zip*, *.7z*, *.rar*, *.gz*).

    The content of an archive goes to a folder built from the archive
    name: the extension is replaced by ``_zip``, ``_7z``, ``_rar`` or
    ``_gz`` and spaces and commas become ``_``. An archive whose folder
    already exists is left untouched.

    @param      group       group name
    @return                 list of new files (the concatenation of what the
                            uncompressing functions return)
    """
    def fvalid(zip_name, local_name):
        # neither cache folders nor compiled python files
        if "__pycache__" in zip_name or zip_name.endswith(".pyc"):
            return False
        return True

    def prepare(archive, suffix, msg_extract, msg_create):
        # Returns the freshly created destination folder,
        # or None if it already exists (archive already processed).
        target = os.path.splitext(archive)[0] + suffix
        target = target.replace(" ", "_").replace(",", "_")
        if os.path.exists(target):
            return None
        self.fLOG(msg_extract.format(archive))
        self.fLOG(msg_create.format(target))
        os.mkdir(target)
        return target

    # the list is built before any folder gets created
    names = list(self.enumerate_group_files(group))
    files = []
    for name in names:
        if "attachments" not in name:
            continue
        ext = os.path.splitext(name)[-1]

        if ext == ".zip":
            folder = prepare(
                name, "_zip",
                "[ProjectsRepository.unzip_files] unzip '{0}'",
                "[ProjectsRepository.unzip_files] creating '{0}'")
            if folder is not None:
                try:
                    lf = unzip_files(
                        name, folder, fLOG=self.fLOG, fvalid=fvalid,
                        fail_if_error=False)
                except zipfile.BadZipFile as e:
                    # a broken archive is reported and gives no file
                    self.fLOG(
                        "[ProjectsRepository.unzip_files] ERROR: unable to unzip '{0}' because of '{1}']".format(name, e))
                    lf = []
                files.extend(lf)
        elif ext == ".7z":
            folder = prepare(
                name, "_7z",
                "[ProjectsRepository.un7zip_files] un7zip '{0}'",
                "[ProjectsRepository.un7zip_files] creating '{0}'")
            if folder is not None:
                lf = un7zip_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid)
                files.extend(lf)
        elif ext == ".rar":
            folder = prepare(
                name, "_rar",
                "[ProjectsRepository.unrar_files] unrar '{0}'",
                "[ProjectsRepository.unrar_files] creating '{0}'")
            if folder is not None:
                lf = unrar_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid)
                files.extend(lf)
        elif ext == ".gz":
            folder = prepare(
                name, "_gz",
                "[ProjectsRepository.ungzip_files] ungzip '{0}'",
                "[ProjectsRepository.ungzip_files] creating '{0}'")
            if folder is not None:
                # the unzip option is switched off for names with pkl.gz
                unzip = "pkl.gz" not in name
                lf = ungzip_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid, unzip=unzip)
                files.extend(lf)
        elif ext == ".tar.gz":
            # NOTE(review): os.path.splitext only returns the last suffix,
            # this test never succeeds and *.tar.gz* files are handled by
            # the ".gz" branch; confirm which behaviour is wanted.
            raise Exception("unable to process such a file: " + name)
    return files
def load_adult_dataset(download=True, small=False, url='uci'):
    """
    Returns the `Adult Data Set <https://archive.ics.uci.edu/ml/datasets/adult>`_.
    Most of the variables are categorical.

    Notebooks associated with this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'adult')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))

    @param      download    downloads the dataset, only looked at when
                            *small* is False; if both are False, the
                            function raises an exception
    @param      small       reads a local copy stored in the data folder
                            (described as a lighter version)
    @param      url         source, ``'uci'`` reads from the UCI repository,
                            any other value reads the compressed copy hosted
                            on *xavierdupre.fr*
    @return                 :epkg:`pandas:DataFrame` (train, test)

    Leading and trailing spaces are removed from every column of type
    *object*, spaces and dots are removed from both ends of the
    label ``'<=50K'``.
    """
    columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
               'marital_status', 'occupation', 'relationship', 'race', 'sex',
               'capital_gain', 'capital_loss', 'hours_per_week',
               'native_country', '<=50K']

    if not small and not download:
        raise NotImplementedError(  # pragma: no cover
            "No local copy")

    if small:
        fold = get_data_folder()
        train = pandas.read_csv(
            os.path.join(fold, 'adult.data.gz'), header=None)
        test = pandas.read_csv(
            os.path.join(fold, 'adult.test.gz'), header=None)
    elif url == 'uci':
        base = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
        train = pandas.read_csv(base + "adult.data", header=None)
        # the first row of the remote test file is skipped
        test = pandas.read_csv(base + "adult.test", header=None, skiprows=1)
    else:
        base = "http://www.xavierdupre.fr/enseignement/complements/"

        def fetch(filename, min_size, **options):
            # downloads a gzip file, uncompresses it in memory, parses it
            raw = read_content_ufs(base + filename, asbytes=True,
                                   encoding=None, min_size=min_size)
            text = ungzip_files(BytesIO(raw), unzip=False).decode('ascii')
            return pandas.read_csv(StringIO(text), header=None, **options)

        train = fetch("adult.data.gz", 400000)
        test = fetch("adult.test.gz", 200000, skiprows=1)

    train.columns = columns
    test.columns = columns

    label = '<=50K'
    for frame in (train, test):
        frame[label] = frame[label].str.strip(' .')

    # the text columns are taken from the train set and used for both sets
    text_columns = train.select_dtypes(object).columns
    for frame in (train, test):
        for c in text_columns:
            frame[c] = frame[c].str.strip()
    return train, test
def unzip_files(self, group):
    """
    Goes through the files of a group whose path contains ``attachments``
    and uncompresses the archives (*.zip*, *.7z*, *.gz*).

    The content of an archive goes to a folder built from the archive
    name: the extension is replaced by ``_zip``, ``_7z`` or ``_gz`` and
    spaces and commas become ``_``. An archive whose folder already
    exists is left untouched.

    @param      group       group name
    @return                 list of new files (the concatenation of what the
                            uncompressing functions return)
    """
    def fvalid(zip_name, local_name):
        # neither cache folders nor compiled python files
        if "__pycache__" in zip_name or zip_name.endswith(".pyc"):
            return False
        return True

    def prepare(archive, suffix, msg_extract, msg_create):
        # Returns the freshly created destination folder,
        # or None if it already exists (archive already processed).
        target = os.path.splitext(archive)[0] + suffix
        target = target.replace(" ", "_").replace(",", "_")
        if os.path.exists(target):
            return None
        self.fLOG(msg_extract.format(archive))
        self.fLOG(msg_create.format(target))
        os.mkdir(target)
        return target

    # the list is built before any folder gets created
    names = list(self.enumerate_group_files(group))
    files = []
    for name in names:
        if "attachments" not in name:
            continue
        ext = os.path.splitext(name)[-1]

        if ext == ".zip":
            folder = prepare(
                name, "_zip",
                "ProjectsRepository.unzip_files [unzip {0}]",
                "ProjectsRepository.unzip_files [creating {0}]")
            if folder is not None:
                lf = unzip_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid)
                files.extend(lf)
        elif ext == ".7z":
            folder = prepare(
                name, "_7z",
                "ProjectsRepository.un7zip_files [un7zip {0}]",
                "ProjectsRepository.un7zip_files [creating {0}]")
            if folder is not None:
                lf = un7zip_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid)
                files.extend(lf)
        elif ext == ".gz":
            folder = prepare(
                name, "_gz",
                "ProjectsRepository.ungzip_files [ungzip {0}]",
                "ProjectsRepository.ungzip_files [creating {0}]")
            if folder is not None:
                lf = ungzip_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid)
                files.extend(lf)
        elif ext == ".tar.gz":
            # NOTE(review): os.path.splitext only returns the last suffix,
            # this test never succeeds and *.tar.gz* files are handled by
            # the ".gz" branch; confirm which behaviour is wanted.
            raise Exception("unable to process such a file: " + name)
    return files