def test_compress_helper(self):
    """
    Compresses this test file with ``zip_files`` then ``gzip_files`` and
    checks that ``unzip_files`` / ``ungzip_files`` give it back.

    Both archives are requested with ``None`` as first argument and the
    compressed result is used as the returned value
    (presumably an in-memory archive, to be confirmed against ``zip_files``).
    """
    fLOG(
        __file__,
        self._testMethodName,
        OutputPrint=__name__ == "__main__")

    # the test may run from a compiled module, the source file is archived
    f = os.path.abspath(__file__).replace(".pyc", ".py")
    expected_end = "_unittests/ut_filehelper/test_compress_helper.py"

    def check_extracted(res):
        # one entry is expected, indexed as (name, content)
        self.assertIsInstance(res, list)
        self.assertEqual(len(res), 1)
        self.assertIsInstance(res[0][1], (bytes, str))
        # the name is given as message to make a failure readable
        self.assertTrue(res[0][0].endswith(expected_end), res[0][0])

    rz = zip_files(None, [f], fLOG=fLOG)
    fLOG(len(rz), type(rz))
    self.assertIsInstance(rz, (bytes, str))
    check_extracted(unzip_files(rz))

    # binary
    rg = gzip_files(None, [f], fLOG=fLOG)
    fLOG(len(rg), type(rg))
    self.assertIsInstance(rg, bytes)
    check_extracted(ungzip_files(rg))
示例#2
0
def download_dump(country, name, folder=".", unzip=True, timeout=-1,
                  overwrite=False, fLOG=noLOG):
    """
    Downloads *wikipedia dumps* from
    `dumps.wikimedia.org/frwiki/latest/
    <https://dumps.wikimedia.org/frwiki/latest/>`_.

    @param      country     country, it prefixes ``wiki`` in the url
    @param      name        name of the stream to download
    @param      folder      where to download
    @param      unzip       unzip the file
    @param      timeout     timeout
    @param      overwrite   overwrite
    @param      fLOG        logging function
    @return                 name of the local file, the unzipped one if
                            *unzip* is True or if the compressed file is
                            not there anymore, the downloaded one otherwise

    The download is skipped if the compressed file or the unzipped file
    already exists unless *overwrite* is True. The compressed file is
    removed once it is unzipped.
    """
    url = "https://dumps.wikimedia.org/{0}wiki/latest/{0}wiki-{1}".format(
        country, name)
    filename = url.rsplit("/", 1)[-1]
    name = os.path.join(folder, filename)
    unzipname = os.path.splitext(name)[0]
    if overwrite or (not os.path.exists(name) and not os.path.exists(unzipname)):
        get_url_content_timeout(url, timeout=timeout,
                                encoding=None, output=name, chunk=2**20, fLOG=fLOG)
    # NOTE(review): with overwrite=True and an existing unzipped file, the
    # archive is downloaded again but not extracted, the unzipped file is
    # then left untouched -- confirm this is the expected behaviour.
    if unzip and not os.path.exists(unzipname):
        names = ungzip_files(name, unzip=False, where_to=folder)
        os.remove(name)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(  # pragma: no cover
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names
    # Nothing was unzipped by this call. The unzipped name is only returned
    # when it was requested (it exists, see the test above) or when the
    # compressed file is missing; with unzip=False and a compressed file
    # on disk, the compressed file is the one the caller can open.
    if name.endswith('.gz') and (unzip or not os.path.exists(name)):
        return unzipname
    return name
    def test_compress_helper(self):
        """
        Compresses this test file with ``zip_files`` then ``gzip_files`` and
        checks that ``unzip_files`` / ``ungzip_files`` give it back.

        Both archives are requested with ``None`` as first argument and the
        compressed result is used as the returned value
        (presumably an in-memory archive, to be confirmed against ``zip_files``).
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        # the test may run from a compiled module, the source file is archived
        f = os.path.abspath(__file__).replace(".pyc", ".py")
        expected_end = "_unittests/ut_filehelper/test_compress_helper.py"

        def check_extracted(res):
            # one entry is expected, indexed as (name, content)
            self.assertIsInstance(res, list)
            self.assertEqual(len(res), 1)
            self.assertIsInstance(res[0][1], (bytes, str))
            # the name is given as message to make a failure readable
            self.assertTrue(res[0][0].endswith(expected_end), res[0][0])

        rz = zip_files(None, [f], fLOG=fLOG)
        fLOG(len(rz), type(rz))
        self.assertIsInstance(rz, (bytes, str))
        check_extracted(unzip_files(rz))

        # binary
        rg = gzip_files(None, [f], fLOG=fLOG)
        fLOG(len(rg), type(rg))
        self.assertIsInstance(rg, bytes)
        check_extracted(ungzip_files(rg))
    def test_compress_helper_text(self):
        """
        Compresses this test file with ``gzip_files`` and ``encoding="utf-8"``,
        then checks the text returned by ``ungzip_files`` contains the name
        of this test.
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")
        # the test may run from a compiled module, the source file is archived
        f = os.path.abspath(__file__).replace(".pyc", ".py")
        rg = gzip_files(None, [f], fLOG=fLOG, encoding="utf-8")
        fLOG(len(rg), type(rg))
        # the compressed result is expected as bytes even with an encoding
        self.assertIsInstance(rg, bytes)

        res = ungzip_files(rg, encoding="utf-8")
        # this file necessarily contains the name of its own test method
        self.assertIn("test_compress_helper_text", res)
    def test_compress_helper_text(self):
        """
        Round-trips this test file through ``gzip_files`` and
        ``ungzip_files`` with ``encoding="utf-8"`` and looks for the name of
        this test in the result.
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")
        # always compress the source file, even when a .pyc is running
        source = os.path.abspath(__file__).replace(".pyc", ".py")
        compressed = gzip_files(None, [source], fLOG=fLOG, encoding="utf-8")
        fLOG(len(compressed), type(compressed))
        # the compressed result is expected as bytes even with an encoding
        self.assertIsInstance(compressed, bytes)

        text = ungzip_files(compressed, encoding="utf-8")
        # this file necessarily contains the name of its own test method
        self.assertIn("test_compress_helper_text", text)
示例#6
0
def download_pageviews(dt,
                       folder=".",
                       unzip=True,
                       timeout=-1,
                       overwrite=False,
                       fLOG=noLOG):
    """
    Downloads wikipedia pageviews for a precise date (up to the hours),
    the url follows the pattern::

        https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz

    @param      dt          datetime
    @param      folder      where to download
    @param      unzip       unzip the file
    @param      timeout     timeout
    @param      overwrite   overwrite
    @param      fLOG        logging function
    @return                 filename, the unzipped one if *unzip* is True
                            or if the compressed file is not there anymore,
                            the downloaded one otherwise

    The download is skipped if the compressed file or the unzipped file
    already exists unless *overwrite* is True. The compressed file is
    removed once it is unzipped.

    More information on page `pageviews <https://dumps.wikimedia.org/other/pageviews/>`_.
    """
    url = "https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz"
    url = dt.strftime(url)
    filename = url.rsplit("/", 1)[-1]
    name = os.path.join(folder, filename)
    unzipname = os.path.splitext(name)[0]
    if overwrite or (not os.path.exists(name)
                     and not os.path.exists(unzipname)):
        get_url_content_timeout(url,
                                timeout=timeout,
                                encoding=None,
                                output=name,
                                chunk=2**20,
                                fLOG=fLOG)
    if unzip and not os.path.exists(unzipname):
        names = ungzip_files(name, unzip=False, where_to=folder)
        os.remove(name)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names
    # Nothing was unzipped by this call. If unzip is True, the unzipped file
    # already exists (see the test above) and the compressed one was removed
    # when it was extracted: returning the compressed name would point to a
    # missing file. The same holds when only the unzipped file is available.
    if unzip or not os.path.exists(name):
        return unzipname
    return name
示例#7
0
    def unzip_files(self, group):
        """
        Extracts the archives (``.zip``, ``.7z``, ``.rar``, ``.gz``) found in
        the attachments of a group.

        Every archive is extracted into a folder named after the archive
        (extension replaced by ``_zip``, ``_7z``, ``_rar``, ``_gz``, spaces
        and commas replaced by ``_``). An archive whose folder already
        exists is considered as already processed and is skipped.

        @param          group       group name
        @return                     list of new files (what the extraction
                                    functions return, concatenated)
        @raise          Exception   if an attachment ends with ``.tar.gz``
        """
        def fvalid(zip_name, local_name):
            # compiled python files are never extracted
            return ("__pycache__" not in zip_name and
                    not zip_name.endswith(".pyc"))

        def new_folder(name, suffix, verb):
            # Creates the destination folder of an archive and returns it,
            # returns None if the folder exists (archive already processed).
            folder = os.path.splitext(name)[0] + suffix
            folder = folder.replace(" ", "_").replace(",", "_")
            if os.path.exists(folder):
                return None
            self.fLOG("[ProjectsRepository.{0}_files] {0} '{1}'".format(
                verb, name))
            self.fLOG("[ProjectsRepository.{0}_files] creating '{1}'".format(
                verb, folder))
            os.mkdir(folder)
            return folder

        # the list is built before any extraction starts
        names = list(self.enumerate_group_files(group))
        files = []
        for name in names:
            if "attachments" not in name:
                continue
            if name.endswith(".tar.gz"):
                # os.path.splitext only returns the last extension (".gz"),
                # the double extension must be tested on the name itself,
                # otherwise the file is handled as a plain gzip
                raise Exception("unable to process such a file: " + name)
            ext = os.path.splitext(name)[-1]
            if ext == ".zip":
                folder = new_folder(name, "_zip", "unzip")
                if folder is None:
                    continue
                try:
                    lf = unzip_files(name,
                                     folder,
                                     fLOG=self.fLOG,
                                     fvalid=fvalid,
                                     fail_if_error=False)
                except zipfile.BadZipFile as e:
                    # a corrupted archive is logged and ignored
                    self.fLOG(
                        "[ProjectsRepository.unzip_files]    ERROR: unable to unzip '{0}' because of '{1}']"
                        .format(name, e))
                    lf = []
                files.extend(lf)
            elif ext == ".7z":
                folder = new_folder(name, "_7z", "un7zip")
                if folder is not None:
                    files.extend(un7zip_files(name,
                                              folder,
                                              fLOG=self.fLOG,
                                              fvalid=fvalid))
            elif ext == ".rar":
                folder = new_folder(name, "_rar", "unrar")
                if folder is not None:
                    files.extend(unrar_files(name,
                                             folder,
                                             fLOG=self.fLOG,
                                             fvalid=fvalid))
            elif ext == ".gz":
                folder = new_folder(name, "_gz", "ungzip")
                if folder is not None:
                    # names containing "pkl.gz" are passed with unzip=False
                    unzip = "pkl.gz" not in name
                    files.extend(ungzip_files(name,
                                              folder,
                                              fLOG=self.fLOG,
                                              fvalid=fvalid,
                                              unzip=unzip))
        return files
    def unzip_files(self, group):
        """
        Extracts the archives (``.zip``, ``.7z``, ``.gz``) found in the
        attachments of a group.

        Every archive is extracted into a folder named after the archive
        (extension replaced by ``_zip``, ``_7z``, ``_gz``, spaces and commas
        replaced by ``_``). An archive whose folder already exists is
        considered as already processed and is skipped.

        @param          group       group name
        @return                     list of new files (what the extraction
                                    functions return, concatenated)
        @raise          Exception   if an attachment ends with ``.tar.gz``
        """
        def fvalid(zip_name, local_name):
            # compiled python files are never extracted
            return ("__pycache__" not in zip_name and
                    not zip_name.endswith(".pyc"))

        def new_folder(name, suffix, verb):
            # Creates the destination folder of an archive and returns it,
            # returns None if the folder exists (archive already processed).
            folder = os.path.splitext(name)[0] + suffix
            folder = folder.replace(" ", "_").replace(",", "_")
            if os.path.exists(folder):
                return None
            self.fLOG("ProjectsRepository.{0}_files [{0} {1}]".format(
                verb, name))
            self.fLOG("ProjectsRepository.{0}_files [creating {1}]".format(
                verb, folder))
            os.mkdir(folder)
            return folder

        # the list is built before any extraction starts
        names = list(self.enumerate_group_files(group))
        files = []
        for name in names:
            if "attachments" not in name:
                continue
            if name.endswith(".tar.gz"):
                # os.path.splitext only returns the last extension (".gz"),
                # the double extension must be tested on the name itself,
                # otherwise the file is handled as a plain gzip
                raise Exception("unable to process such a file: " + name)
            ext = os.path.splitext(name)[-1]
            if ext == ".zip":
                folder = new_folder(name, "_zip", "unzip")
                if folder is not None:
                    files.extend(unzip_files(name,
                                             folder,
                                             fLOG=self.fLOG,
                                             fvalid=fvalid))
            elif ext == ".7z":
                folder = new_folder(name, "_7z", "un7zip")
                if folder is not None:
                    files.extend(un7zip_files(name,
                                              folder,
                                              fLOG=self.fLOG,
                                              fvalid=fvalid))
            elif ext == ".gz":
                folder = new_folder(name, "_gz", "ungzip")
                if folder is not None:
                    files.extend(ungzip_files(name,
                                              folder,
                                              fLOG=self.fLOG,
                                              fvalid=fvalid))
        return files
    def unzip_files(self, group):
        """
        Extracts the archives (``.zip``, ``.7z``, ``.rar``, ``.gz``) found in
        the attachments of a group.

        Every archive goes into its own folder: the archive name without its
        extension followed by ``_zip``, ``_7z``, ``_rar`` or ``_gz``, spaces
        and commas replaced by ``_``. If that folder exists, the archive is
        considered as already processed and is skipped.

        @param          group       group name
        @return                     list of new files (what the extraction
                                    functions return, concatenated)
        @raise          Exception   if an attachment ends with ``.tar.gz``
        """
        def fvalid(zip_name, local_name):
            # compiled python files are never extracted
            if "__pycache__" in zip_name or zip_name.endswith(".pyc"):
                return False
            return True

        # extension -> (folder suffix, verb used in the logged messages)
        known = {".zip": ("_zip", "unzip"),
                 ".7z": ("_7z", "un7zip"),
                 ".rar": ("_rar", "unrar"),
                 ".gz": ("_gz", "ungzip")}

        # the list is built before any extraction starts
        names = list(self.enumerate_group_files(group))
        files = []
        for name in names:
            if "attachments" not in name:
                continue
            if name.endswith(".tar.gz"):
                # os.path.splitext never returns ".tar.gz" but ".gz":
                # without this test on the whole name, the archive would be
                # processed as a plain gzip
                raise Exception("unable to process such a file: " + name)
            root, ext = os.path.splitext(name)
            if ext not in known:
                continue
            suffix, verb = known[ext]
            folder = (root + suffix).replace(" ", "_").replace(",", "_")
            if os.path.exists(folder):
                # already done, we do not do it again
                continue
            self.fLOG(
                "[ProjectsRepository.{0}_files] {0} '{1}'".format(verb, name))
            self.fLOG(
                "[ProjectsRepository.{0}_files] creating '{1}'".format(verb, folder))
            os.mkdir(folder)
            if ext == ".zip":
                try:
                    lf = unzip_files(
                        name, folder, fLOG=self.fLOG, fvalid=fvalid, fail_if_error=False)
                except zipfile.BadZipFile as e:
                    # a corrupted archive is logged and ignored
                    self.fLOG(
                        "[ProjectsRepository.unzip_files]    ERROR: unable to unzip '{0}' because of '{1}']".format(name, e))
                    lf = []
            elif ext == ".7z":
                lf = un7zip_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid)
            elif ext == ".rar":
                lf = unrar_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid)
            else:
                # ".gz": names containing "pkl.gz" are passed with unzip=False
                unzip = "pkl.gz" not in name
                lf = ungzip_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid, unzip=unzip)
            files.extend(lf)
        return files
示例#10
0
def load_adult_dataset(download=True, small=False, url='uci'):
    """
    Returns the dataset
    `Adult Data Set  <https://archive.ics.uci.edu/ml/datasets/adult>`_.
    Most of the variables are categorical.
    Notebooks related to this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'adult')
        links = ['    * %s' % s for s in links]
        print('\\n'.join(links))

    @param  download    downloads the dataset, if False (and *small* is
                        False as well), the function raises an exception
    @param  small       loads a lighter version stored locally,
                        it takes precedence over *download*
    @param  url         source, ``'uci'`` for the UCI repository,
                        any other value for the gzipped copy
    @return             :epkg:`pandas:DataFrame` (train, test)
    """
    if not (small or download):
        raise NotImplementedError(  # pragma: no cover
            "No local copy")

    names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
             'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
             'hours_per_week', 'native_country', '<=50K']

    if small:
        local = get_data_folder()
        train_path = os.path.join(local, 'adult.data.gz')
        test_path = os.path.join(local, 'adult.test.gz')
        train = pandas.read_csv(train_path, header=None)
        test = pandas.read_csv(test_path, header=None)
    elif url == 'uci':
        root = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
        train = pandas.read_csv(root + "adult.data", header=None)
        # the first row of the test file is skipped
        test = pandas.read_csv(root + "adult.test", header=None, skiprows=1)
    else:
        root = "http://www.xavierdupre.fr/enseignement/complements/"

        def fetch(filename, min_size, **kwargs):
            # downloads a gzipped file, decompresses it in memory
            # and parses the text as CSV
            raw = read_content_ufs(root + filename,
                                   asbytes=True, encoding=None,
                                   min_size=min_size)
            content = ungzip_files(BytesIO(raw), unzip=False)
            buffer = StringIO(content.decode('ascii'))
            return pandas.read_csv(buffer, header=None, **kwargs)

        train = fetch("adult.data.gz", 400000)
        test = fetch("adult.test.gz", 200000, skiprows=1)

    train.columns = names
    test.columns = names

    # the label is stripped from spaces and dots
    label = '<=50K'
    for frame in (train, test):
        frame[label] = frame[label].str.strip(' .')

    # the object columns of the train set are stripped in both sets
    text_columns = train.select_dtypes(object).columns
    for frame in (train, test):
        for c in text_columns:
            frame[c] = frame[c].str.strip()
    return train, test
示例#11
0
    def unzip_files(self, group):
        """
        Extracts the archives (``.zip``, ``.7z``, ``.gz``) found in the
        attachments of a group.

        Every archive goes into its own folder: the archive name without its
        extension followed by ``_zip``, ``_7z`` or ``_gz``, spaces and commas
        replaced by ``_``. If that folder exists, the archive is considered
        as already processed and is skipped.

        @param          group       group name
        @return                     list of new files (what the extraction
                                    functions return, concatenated)
        @raise          Exception   if an attachment ends with ``.tar.gz``
        """
        def fvalid(zip_name, local_name):
            # compiled python files are never extracted
            if "__pycache__" in zip_name or zip_name.endswith(".pyc"):
                return False
            return True

        # extension -> (folder suffix, verb used in the logged messages)
        known = {".zip": ("_zip", "unzip"),
                 ".7z": ("_7z", "un7zip"),
                 ".gz": ("_gz", "ungzip")}

        # the list is built before any extraction starts
        names = list(self.enumerate_group_files(group))
        files = []
        for name in names:
            if "attachments" not in name:
                continue
            if name.endswith(".tar.gz"):
                # os.path.splitext never returns ".tar.gz" but ".gz":
                # without this test on the whole name, the archive would be
                # processed as a plain gzip
                raise Exception("unable to process such a file: " + name)
            root, ext = os.path.splitext(name)
            if ext not in known:
                continue
            suffix, verb = known[ext]
            folder = (root + suffix).replace(" ", "_").replace(",", "_")
            if os.path.exists(folder):
                # already done, we do not do it again
                continue
            self.fLOG(
                "ProjectsRepository.{0}_files [{0} {1}]".format(verb, name))
            self.fLOG(
                "ProjectsRepository.{0}_files [creating {1}]".format(verb, folder))
            os.mkdir(folder)
            if ext == ".zip":
                lf = unzip_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid)
            elif ext == ".7z":
                lf = un7zip_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid)
            else:
                # ".gz"
                lf = ungzip_files(
                    name, folder, fLOG=self.fLOG, fvalid=fvalid)
            files.extend(lf)
        return files