Example #1
    def test_theano_logreg(self):
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        if is_travis_or_appveyor():
            # it requires latex
            return

        from theano import config
        fLOG(config)
        from src.ensae_teaching_cs.examples.theano_logreg import theano_sgd_optimization_mnist, theano_predict
        temp = get_temp_folder(__file__, "temp__theano_logreg")
        dataset = "mnist.pkl.gz"
        if not os.path.exists(dataset):
            download_data(dataset,
                          website="http://deeplearning.net/data/mnist/")
        model = os.path.join(temp, "log_reg_theano.bin")
        theano_sgd_optimization_mnist(dataset=dataset,
                                      saved_model=model,
                                      n_epochs=2,
                                      fLOG=fLOG)
        pred = theano_predict(model, dataset, 10)
        fLOG(pred)
        fLOG(type(pred))
        self.assertEqual(len(pred), 10)
Example #2
    def test_theano_logreg(self):
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        if is_travis_or_appveyor():
            # it requires latex
            return

        from theano import config
        fLOG(config)
        from src.ensae_teaching_cs.examples.theano_logreg import theano_sgd_optimization_mnist, theano_predict
        temp = get_temp_folder(__file__, "temp__theano_logreg")
        dataset = "mnist.pkl.gz"
        if not os.path.exists(dataset):
            download_data(
                dataset, website="http://deeplearning.net/data/mnist/")
        model = os.path.join(temp, "log_reg_theano.bin")
        theano_sgd_optimization_mnist(
            dataset=dataset, saved_model=model, n_epochs=2, fLOG=fLOG)
        pred = theano_predict(model, dataset, 10)
        fLOG(pred)
        fLOG(type(pred))
        self.assertEqual(len(pred), 10)
Example #3
def wolf_xml(url="http://pauillac.inria.fr/~sagot/index.html",
             temp_folder=".",
             fLOG=noLOG):
    """
    The `WOLF <http://alpage.inria.fr/~sagot/wolf-en.html>`_
    (Wordnet Libre du Français, Free French Wordnet) is a free semantic
    lexical resource (wordnet) for French.

    This data is licensed under `Cecill-C license
    <http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.html>`_.
    Language is French.

    @param      url             url
    @param      fLOG            logging function
    @param      temp_folder     where to download
    @return                     list of files
    """
    link = url
    page = download_page(link)
    reg = re.compile("href=\\\"(https.*?wolf.*?[.]bz2)\\\"")
    alls = reg.findall(page)
    if len(alls) == 0:
        raise LinkNotFoundError(  # pragma: no cover
            "unable to find a link on a .bz2 file on page\n{}".format(page))

    url = alls[0]
    spl = url.split("/")
    url = "/".join(spl[:-1]) + "/"
    url2 = "/".join(spl[:-2]) + "/31718/"
    try:
        dtd = download_data("debvisdic-strict.dtd",
                            url=[url2, "xd"],
                            fLOG=fLOG,
                            whereTo=temp_folder)
    except DownloadDataException:
        dtd = None
    name = spl[-1].strip('.')

    try:
        local = download_data(name,
                              url=[url, "xd"],
                              fLOG=fLOG,
                              whereTo=temp_folder)
    except DownloadDataException:
        local = None
    if local is not None and isinstance(local, str):
        local = [local]
    # We check the file was downloaded.
    expected = os.path.join(temp_folder, "wolf-1.0b4.xml")
    if local is None or not os.path.exists(expected):  # pragma: no cover
        res = download_data("wolf-1.0b4.xml.zip",
                            whereTo=temp_folder,
                            fLOG=fLOG)
        if not os.path.exists(expected):
            raise FileNotFoundError(expected)
        return res
    elif isinstance(dtd, list):
        return local + dtd
    return local + [dtd]  # pragma: no cover
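A minimal usage sketch for the function above; the scratch folder name is illustrative and `noLOG` is assumed to come from pyquickhelper:

# Hedged sketch, not part of the original source: download the WOLF lexicon.
from pyquickhelper.loghelper import noLOG

files = wolf_xml(temp_folder="temp_wolf", fLOG=noLOG)
for name in files:
    print(name)  # local paths of the downloaded files (.bz2, .dtd or .xml)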
Example #4
def load_sentiment_dataset(cache="."):
    """
    Returns a set of English sentences, each labelled
    with a positive or negative sentiment.
    Source:
    `Sentiment Labelled Sentences Data Set <https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences>`_.

    @param          cache       where to cache or unzip the data if downloaded a second time
    @return                     dataframe with the labelled sentences
    """
    from pyensae.datasource import download_data
    # url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/"
    name = "sentiment_labelled_sentences.zip"
    res = download_data(name, whereTo=cache)
    if len(res) != 9:
        raise ValueError("Unzipping '{0}' failed.".format(name))
    dfs = []
    for fi in res:
        if ".txt" not in fi or "readme" in fi or "__MACOSX" in fi:
            continue
        df = pandas.read_csv(fi,
                             sep='\t',
                             quoting=3,
                             names=['sentence', 'sentiment'])
        df["source"] = os.path.splitext(os.path.split(fi)[-1])[0]
        dfs.append(df)
    return pandas.concat(dfs)
Example #5
def load_sentiment_dataset(cache="."):
    """
    Returns a set of English sentences, each labelled
    with a positive or negative sentiment.
    Source:
    `Sentiment Labelled Sentences Data Set <https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences>`_.

    @param          cache       where to cache or unzip the data if downloaded a second time
    @return                     dataframe with the labelled sentences
    """
    from pyensae.datasource import download_data
    # url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/"
    name = "sentiment_labelled_sentences.zip"
    res = download_data(name, whereTo=cache)
    if len(res) != 9:
        raise ValueError("Unzipping '{0}' failed.".format(name))
    dfs = []
    for fi in res:
        if ".txt" not in fi or "readme" in fi or "__MACOSX" in fi:
            continue
        df = pandas.read_csv(fi, sep='\t', quoting=3,
                             names=['sentence', 'sentiment'])
        df["source"] = os.path.splitext(os.path.split(fi)[-1])[0]
        dfs.append(df)
    return pandas.concat(dfs)
Example #6
def data_cpt_ENSAE_2016_11(folder=".", fLOG=noLOG):
    """
    Returns the data for the competition
    `Python 2A ENSAE 2016 <https://competitions.codalab.org/competitions/13301>`_,
    located on github `ensae_competition_2016.zip
    <https://github.com/sdpython/ensae_teaching_cs/raw/master/_doc/competitions/
    2016_ENSAE_2A/ensae_competition_2016.zip>`_.

    @param      folder      where to download and unzip
    @param      fLOG        logging function
    @return                 2 dataframes, one with X and Y, the other with only X
    """
    url = "https://github.com/sdpython/ensae_teaching_cs/raw/master/_doc/competitions/2016_ENSAE_2A/"
    file = "ensae_competition_2016.zip"
    files = download_data(file, url=url, whereTo=folder, fLOG=fLOG)
    df1 = pandas.read_csv(
        [f for f in files if f.endswith("ensae_competition_train.txt")][0],
        header=[0, 1],
        sep="\t",
        index_col=0)
    df2 = pandas.read_csv([f for f in files if "test_X" in f][0],
                          header=[0, 1],
                          sep="\t",
                          index_col=0)
    return df1, df2
Example #7
def get_seattle_streets(filename=None, folder="."):
    """
    Retrieves processed data from
    `Seattle Streets <https://data.seattle.gov/dataset/Street-Network-Database/afip-2mzr/data>`_.

    @param      filename        local filename
    @param      folder          temporary folder where to download files
    @return                     local filename of the shapefile

    The function downloads the data only when *filename* is None.
    """
    if filename is None:
        download_data("WGS84_seattle_street.zip", whereTo=folder)
        filename = os.path.join(folder, "Street_Network_Database.shp")
    elif not os.path.exists(filename):
        raise FileNotFoundError(filename)
    return filename
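The docstring mentions shapes and records; a hedged sketch, assuming pyshp (``pip install pyshp``) is used to read the returned shapefile:

# Sketch only: read the shapefile returned by get_seattle_streets with pyshp.
import shapefile

shp_name = get_seattle_streets(folder="temp_seattle")
reader = shapefile.Reader(shp_name)
shapes = reader.shapes()     # street geometries
records = reader.records()   # one attribute row per street
print(len(shapes), len(records))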
Example #8
def get_data(whereTo=".", timeout=None, fLOG=noLOG):
    """
    Returns the data for the streets of Paris. Arcs are assumed to be unique:
    if :math:`j \\rightarrow k` is present, :math:`k \\rightarrow j` is not.
    This is checked below.

    @param      whereTo         folder where to download the data
    @param      timeout         timeout (seconds) when establishing the connection
    @param      fLOG            logging function
    @return                     list of arcs

    An arc is a 6-uple with the following fields:

    - v1: index of the first node
    - v2: index of the second node
    - ways: one-way or both ways
    - p1: coordinates of node 1
    - p2: coordinates of node 2
    - d: distance

    """
    from pyensae.datasource import download_data
    data = download_data("paris_54000.zip",
                         whereTo=whereTo,
                         fLOG=fLOG,
                         timeout=timeout)
    name = data[0]
    with open(name, "r") as f:
        lines = f.readlines()

    vertices = []
    edges = []
    for i, line in enumerate(lines):
        spl = line.strip("\n\r").split(" ")
        if len(spl) == 2:
            vertices.append((float(spl[0]), float(spl[1])))
        elif len(spl) == 5 and i > 0:
            v1, v2 = int(spl[0]), int(spl[1])
            ways = int(spl[2])  # one-way or both ways
            p1 = vertices[v1]
            p2 = vertices[v2]
            edges.append((v1, v2, ways, p1, p2,
                          distance_haversine(p1[0], p1[1], p2[0], p2[1])))
        elif i > 0:
            raise Exception("unable to interpret line {0}: ".format(i) + line)

    pairs = {}
    for e in edges:
        p = e[:2]
        if p in pairs:
            raise ValueError("unexpected pairs, already present: " + str(e))
        pairs[p] = True

    return edges
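Each arc is the 6-uple described in the docstring; a short sketch, with an arbitrary scratch folder, that aggregates the distance field:

# Sketch: d is the last field of each 6-uple (v1, v2, ways, p1, p2, d).
edges = get_data(whereTo="temp_paris")
total = sum(e[-1] for e in edges)
print(len(edges), "arcs, total length:", total)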
Example #9
    def test_mobilenet(self):
        src = ("https://s3.amazonaws.com/onnx-model-zoo/mobilenet/"
               "mobilenetv2-1.0/")
        model_file = "mobilenetv2-1.0.onnx"
        download_data(model_file, website=src)
        X = numpy.random.rand(1, 3, 224, 224).astype(dtype=numpy.float32)
        rts = [
            'python', 'python_compiled_debug', 'python_compiled',
            'onnxruntime1'
        ]
        res = []
        for i, rt in enumerate(rts):
            oinf = OnnxInference(model_file, runtime=rt)
            self.assertNotEmpty(oinf)
            self.assertEqual(oinf.input_names[:1], ['data'])
            if hasattr(oinf, 'inits_'):
                self.assertIn("mobilenetv20_features_conv0_weight",
                              oinf.inits_)
                self.assertEqualArray(
                    (0, -1), oinf.inits_["reshape_attr_tensor421"]['value'])
            name = oinf.input_names[0]
            out = oinf.output_names[0]
            if 'debug' in rt:
                Y, stdout, _ = self.capture(lambda oi=oinf: oi.run({name: X}))  # pylint: disable=W0640
                self.assertIn('-=', stdout)
            else:
                Y = oinf.run({name: X})
            if any(map(numpy.isnan, Y[out].ravel())):
                raise AssertionError("Runtime {}:{} produces NaN.\n{}".format(
                    i, rt, Y[out]))
            res.append((rt, Y[out]))
        # Compare every runtime against the first one.
        exp = numpy.squeeze(res[0][1])
        for rt, r in res[1:]:
            got = numpy.squeeze(r)
            try:
                self.assertEqual(exp.shape, got.shape)
                self.assertEqualArray(got, exp)
            except AssertionError as e:
                raise AssertionError(
                    "Issue with runtime: '{}'.".format(rt)) from e
Example #10
def elections_legislatives_bureau_vote(source=None, folder=".", fLOG=noLOG):
    """
    Retrieves data from
    `Résultat des élections législatives françaises de 2012 au niveau bureau de vote
    <https://www.data.gouv.fr/fr/datasets/resultat-des-elections-legislatives-francaises-de-2012-au-niveau-bureau-de-vote-nd/>`_.

    @param      source  should be None unless you want to use the backup plan ("xd")
    @param      folder  where to download
    @param      fLOG    logging function
    @return             dictionary of dataframes (keys ``T1`` and ``T2``)

    Other sources:

    * `Résultats élections municipales 2014 par bureau de vote
      <http://www.nosdonnees.fr/dataset/resultats-elections-municipales-2014-par-bureau-de-vote>`_
    * `Elections 2015 - Découpage des bureaux de Vote
      <https://www.data.gouv.fr/fr/datasets/elections-2015-decoupage-des-bureaux-de-vote/>`_
    * `Contours des cantons électoraux départementaux 2015
      <https://www.data.gouv.fr/fr/datasets/contours-des-cantons-electoraux-departementaux-2015/>`_
    * `Découpage électoral de la commune, pour les élections législatives
      <https://www.data.gouv.fr/fr/datasets/circonscriptions/>`_ (unusual format)
    * `Statistiques démographiques INSEE sur les nouvelles circonscriptions législatives de 2012
      <https://www.data.gouv.fr/fr/datasets/statistiques-demographiques-insee
      -sur-les-nouvelles-circonscriptions-legislatives-de-2012-nd/>`_
    """
    if source is None:
        try:  # pragma: no cover
            with urllib.request.urlopen("http://www.nosdonnees.fr/") as f:
                url = "http://www.nosdonnees.fr/storage/f/2013-03-05T184148/"
                if f is None:
                    raise Exception(
                        "Not sure we can continue. Pretty sure we should stop."
                    )
        except (urllib.error.HTTPError,
                RemoteDisconnected):  # pragma: no cover
            url = "xd"
        file = "LG12_BV_T1T2.zip"
    else:
        url = source
        file = "LG12_BV_T1T2.zip"
    data = download_data(file, website=url, whereTo=folder, fLOG=fLOG)
    res = {}
    for d in data:
        df = pandas.read_csv(d, encoding="latin-1", sep=";", low_memory=False)
        if d.endswith("_T2.txt"):
            key = "T2"
        elif d.endswith("_T1.txt"):
            key = "T1"
        else:
            raise ValueError(  # pragma: no cover
                "Unable to guess key for filename: '{0}'".format(d))
        res[key] = df
    return res
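The function returns a dictionary keyed by election round; a short usage sketch:

# Sketch: res maps "T1" (first round) and "T2" (second round) to dataframes.
res = elections_legislatives_bureau_vote(folder="temp_elections")
print(res["T1"].shape, res["T2"].shape)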
Example #11
def load_irep(cache="."):
    """
    Downloads the data of the `registre des émissions polluantes (IREP)
    <http://www.georisques.gouv.fr/dossiers/irep/telechargement#>`_
    for the years 2003-2017.

    @param          cache       where to cache or unzip the data if downloaded a second time
    @return                     list of files
    """
    from pyensae.datasource import download_data
    name = "irep.zip"
    res = download_data(name, whereTo=cache)
    return res
Example #12
    def test_wolf_backup(self):
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        temp = get_temp_folder(__file__, "temp_wolf_backup")
        from pyensae.datasource import download_data
        res = download_data("wolf-1.0b4.xml.zip", whereTo=temp, fLOG=fLOG)
        self.assertEqual(len(res), 1)
        outfile = [os.path.join(temp, _) for _ in os.listdir(temp)]
        self.assertTrue(len(outfile) != 0)
        if os.stat(outfile[0]).st_size < 1000000:
            raise AssertionError(
                "Downloaded file is too small: '{0}'.".format(outfile[0]))
Example #13
    def test_fairtest(self):
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        # ete3 is needed by fairtest.
        import ete3
        assert ete3 is not None
        # We check it is working. Otherwise, you should remove PyQt5.

        try:
            import fairtest as skip___
        except ImportError:
            path = os.path.normpath(
                os.path.abspath(
                    os.path.join(
                        os.path.split(__file__)[0], "..", "..", "..",
                        "fairtest", "src")))
            if path not in sys.path:
                sys.path.append(path)
            import fairtest as skip___

        if is_travis_or_appveyor():
            # no fair test
            return

        from fairtest import DataSource, Testing, train, test, report  # pylint: disable=E0401
        temp = get_temp_folder(__file__, "temp_fairtest")
        data = ds.download_data(
            "adult.data",
            url="https://archive.ics.uci.edu/ml/machine-learning-databases/adult/",
            whereTo=os.path.join(temp, ".."))
        names = (
            "age,Workclass,fnlwgt,education,education-num,marital-status,"
            "occupation,relationship,race,sex,capital-gain,capital-loss,"
            "hours-per-week,native-country,income").split(",")
        df = pandas.read_csv(data, names=names)
        if df.shape[0] > 100:
            fLOG(df.shape)
            df = df[:100]
        data = DataSource(df, budget=1, conf=0.95)
        SENS = ['sex', 'race']  # Protected features
        TARGET = 'income'  # Output
        EXPL = ''  # Explanatory feature

        inv = Testing(data, SENS, TARGET, EXPL)
        train([inv])
        test([inv])
        report([inv], "adult", temp)
Example #14
def load_enedis_dataset(dest='.', fLOG=None):
    """
    Returns data extracted from the :epkg:`Enedis` website:
    `Production électrique annuelle par filière à la maille commune
    <https://data.enedis.fr/explore/dataset/production-electrique-par-filiere-a-la-maille-commune/export/>`_.
    The dataset is an extract for the years 2015-2016.
    The download relies on the module :epkg:`pyensae`.
    Notebooks associated with this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('visualisation', 'enedis')
        links = ['    * %s' % s for s in links]
        print('\\n'.join(links))

    @param      dest    destination folder
    @param      fLOG    logging function
    @return             :epkg:`pandas:DataFrame`
    """
    from pyensae.datasource import download_data
    name = "production-electrique-par-filiere-a-la-maille-commune.extrait.2015-2016.csv.zip"
    if fLOG:
        res = download_data(name, whereTo=dest, fLOG=fLOG)
    else:
        res = download_data(name, whereTo=dest)
    if len(res) != 1:
        raise ValueError(  # pragma: no cover
            "Unzipping '{0}' failed.".format(name))
    df = pandas.read_csv(res[0], sep=';', encoding='utf-8')
    df['long'] = df['Geo Point 2D'].apply(
        lambda x: float(x.split(',')[1].strip()))
    df['lat'] = df['Geo Point 2D'].apply(
        lambda x: float(x.split(',')[0].strip()))
    return df
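A short plotting sketch for the coordinates extracted above; matplotlib is an assumption of this example, not a requirement of the function:

# Sketch: 'long' and 'lat' are created by load_enedis_dataset itself.
import matplotlib.pyplot as plt

df = load_enedis_dataset(dest="temp_enedis")
df.plot(x="long", y="lat", kind="scatter", s=1)
plt.show()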
Example #15
def data_cpt_ENSAE_2016_11(folder=".", fLOG=noLOG):
    """
    Returns the data for the competition
    `Python 2A ENSAE 2016 <https://competitions.codalab.org/competitions/13301>`_,
    located on github `ensae_competition_2016.zip <https://github.com/sdpython/ensae_teaching_cs/raw/master/_doc/competitions/2016_ENSAE_2A/ensae_competition_2016.zip>`_.

    @param      folder      where to download and unzip
    @param      fLOG        logging function
    @return                 2 dataframes, one with X and Y, the other with only X
    """
    url = "https://github.com/sdpython/ensae_teaching_cs/raw/master/_doc/competitions/2016_ENSAE_2A/"
    file = "ensae_competition_2016.zip"
    files = download_data(file, url=url, whereTo=folder, fLOG=fLOG)
    df1 = pandas.read_csv(
        [f for f in files if f.endswith("ensae_competition_train.txt")][0], header=[0, 1], sep="\t", index_col=0
    )
    df2 = pandas.read_csv([f for f in files if "test_X" in f][0], header=[0, 1], sep="\t", index_col=0)
    return df1, df2
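Both files are read with ``header=[0, 1]``, so the columns form a two-level MultiIndex; a short usage sketch:

# Sketch: df1 holds X and Y for training, df2 holds X only.
df1, df2 = data_cpt_ENSAE_2016_11(folder="temp_cpt")
print(df1.columns.nlevels)   # 2, because of header=[0, 1]
print(df1.shape, df2.shape)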
Example #16
    def test_euler(self):
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")
        folder = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                              "temp_rues_euler")
        if not os.path.exists(folder):
            os.mkdir(folder)
        edges = get_data(whereTo=folder, fLOG=fLOG)

        data = download_data("added.zip", whereTo=folder, fLOG=fLOG)
        with open(data[0], "r") as f:
            text = f.read()
        added_edges = eval(text)  # the file contains a Python literal with the added edges
        path = euler_path(edges, added_edges)
        fLOG(len(path), len(edges) + len(added_edges))
        for p in path[:5]:
            fLOG(len(p), p)
        for p in path[-5:]:
            fLOG(len(p), p)
Example #17
def elections_vote_places_geo(source="xd", folder=".", fLOG=noLOG):
    """
    Retrieves data about vote places (*bureaux de vote* in French)
    with geocodes.

    @param      source  should be None unless you want to use the backup plan ("xd")
    @param      folder  where to download
    @param      fLOG    logging function
    @return             dataframe with the geocoded vote places
    """
    if source is None:
        raise NotImplementedError("use source='xd'")
    url = source
    file = "bureauxvotegeo.zip"
    data = download_data(file, website=url, whereTo=folder, fLOG=fLOG)
    for d in data:
        if d.endswith(".txt"):
            df = pandas.read_csv(d, sep="\t", encoding="utf-8")
            return df
    raise DataNotAvailableError(
        "Unable to find any csv file in '{0}'".format(file))
Example #18
def load_sentiment_dataset(cache="."):
    """
    Returns a set of English sentences, each labelled
    with a positive or negative sentiment.
    Source:
    `Sentiment Labelled Sentences Data Set <https://archive.ics.uci.edu/
    ml/datasets/Sentiment+Labelled+Sentences>`_.
    Notebooks associated with this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'text_sentiment_wordvec')
        links = ['    * %s' % s for s in links]
        print('\\n'.join(links))

    @param          cache       where to cache or unzip the data if
                                downloaded a second time
    @return                     dataframe with the labelled sentences
    """
    from pyensae.datasource import download_data
    # url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/"
    name = "sentiment_labelled_sentences.zip"
    res = download_data(name, whereTo=cache)
    if len(res) != 9:
        raise ValueError(  # pragma: no cover
            "Unzipping '{0}' failed.".format(name))
    dfs = []
    for fi in res:
        if ".txt" not in fi or "readme" in fi or "__MACOSX" in fi:
            continue
        df = pandas.read_csv(fi,
                             sep='\t',
                             quoting=3,
                             names=['sentence', 'sentiment'])
        df["source"] = os.path.splitext(os.path.split(fi)[-1])[0]
        dfs.append(df)
    return pandas.concat(dfs)
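A usage sketch; the column names come from the ``read_csv`` call above:

# Sketch: one row per sentence, a 0/1 label and the name of the source file.
df = load_sentiment_dataset(cache="temp_sentiment")
print(df.groupby(["source", "sentiment"]).size())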
Example #19
    def test_euler(self):
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")
        folder = os.path.join(
            os.path.abspath(
                os.path.dirname(__file__)),
            "temp_rues_euler")
        if not os.path.exists(folder):
            os.mkdir(folder)
        edges = get_data(whereTo=folder, fLOG=fLOG)

        data = download_data("added.zip", whereTo=folder, fLOG=fLOG)
        with open(data[0], "r") as f:
            text = f.read()
        added_edges = eval(text)  # the file contains a Python literal with the added edges
        path = euler_path(edges, added_edges)
        fLOG(len(path), len(edges) + len(added_edges))
        for p in path[:5]:
            fLOG(len(p), p)
        for p in path[-5:]:
            fLOG(len(p), p)
Example #20
def elections_legislatives_circonscription_geo(source="xd",
                                               folder=".",
                                               fLOG=noLOG):
    """
    Retrieves data from
    `Countours des circonscriptions des législatives <https://www.data.gouv.fr/fr/
    datasets/countours-des-circonscriptions-des-legislatives-nd/>`_.

    @param      source  should be None unless you want to use the backup plan ("xd")
    @param      folder  where to download
    @param      fLOG    logging function
    @return             dataframe
    """
    if source is None:
        raise NotImplementedError(  # pragma: no cover
            "use source='xd'")
    url = source
    file = "toxicode_circonscriptions_legislatives.zip"
    data = download_data(file, website=url, whereTo=folder, fLOG=fLOG)
    for d in data:
        if d.endswith(".csv"):
            df = pandas.read_csv(d, sep=",", encoding="utf-8")
            return df
    raise DataNotAvailableError(
        "unable to find any csv file in '{0}'".format(file))
Example #21
# coding: latin-1
import sys
import datetime
sys.path.append("../../../../program/python/pyensae/src")

from pyensae.datasource import download_data

print("A", datetime.datetime.now())
download_data("SQLiteSpy.zip", website='xd')
print("B", datetime.datetime.now())
download_data("td8_velib.zip", website='xd')
print("C", datetime.datetime.now())

from pyensae import import_flatfile_into_database
dbf = "td8_velib2.db3"
if False:  # set to True to import the flat files into the database once
    print("import", datetime.datetime.now())
    import_flatfile_into_database(dbf, "td8_velib.txt")
    print("import", datetime.datetime.now())
    import_flatfile_into_database(dbf, "stations.txt", table="stations")
    print("import", datetime.datetime.now())

if False:  # set to True to check the content of the database
    import sqlite3
    conn = sqlite3.connect(dbf)
    data = conn.execute("SELECT * FROM stations")
    for d in data:
        print(d)
    conn.close()
Example #22
def table_mortalite_euro_stat(url="http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/",
                              name="demo_mlifetable.tsv.gz", final_name="mortalite.txt",
                              whereTo=".", stop_at=None, fLOG=noLOG):
    """
    This function retrieves the mortality table from `EuroStat <http://ec.europa.eu/eurostat/fr>`_ through
    `table de mortalité <http://www.data-publica.com/opendata/7098--population-et-conditions-sociales-table-de-mortalite-de-1960-a-2010>`_
    (*this link is currently broken, data-publica no longer provides this database, a copy is provided*).

    @param      url         data source
    @param      name        data table name
    @param      final_name  the data is compressed, it needs to be uncompressed into a file,
                            this parameter defines its name
    @param      whereTo     data needs to be downloaded, location of this place
    @param      stop_at     the overall process is quite long, if not None, only the first *stop_at* rows are kept
    @param      fLOG        logging function
    @return                 data_frame

    The function checks whether the file *final_name* exists.
    If it does, the data is not downloaded twice.

    The header has an unusual format as the key coordinates are packed into a single column, separated by commas::

        indic_de,sex,age,geo\time	2013 	2012 	2011 	2010 	2009

    We need to preprocess the data to split this information into columns.
    The overall process takes 4-5 minutes, 10 seconds to download (< 10 Mb),
    4-5 minutes to preprocess the data (it could be improved). The processed data
    contains the following columns::

        ['annee', 'valeur', 'age', 'age_num', 'indicateur', 'genre', 'pays']

    Columns *age* and *age_num* look alike. *age_num* is numeric and is equal
    to *age* except when *age_num* is 85. Everybody above that age falls into the same category.
    The table contains many indicators:

    * PROBSURV: probability of surviving between two exact ages (px)
    * LIFEXP: life expectancy at exact age (ex)
    * SURVIVORS: number of survivors at exact age (lx)
    * PYLIVED: number of person-years lived between two exact ages (Lx)
    * DEATHRATE: death rate at age x (Mx)
    * PROBDEATH: probability of dying between two exact ages (qx)
    * TOTPYLIVED: total number of person-years lived after exact age (Tx)
    """
    if os.path.exists(final_name) and os.stat(final_name).st_size > 1e7:
        return final_name

    temp = final_name + ".remove.txt"
    if not os.path.exists(temp) or os.stat(temp).st_size < 1e7:
        local = download_data(name, url=url, whereTo=whereTo)
        local = local[0] + ".gz"
        with gzip.open(local, 'rb') as f:
            file_content = f.read()
        content = str(file_content, encoding="utf8")
        with open(temp, "w", encoding="utf8") as f:
            f.write(content)

    def format_age(s):
        if s.startswith("Y_"):
            if s.startswith("Y_LT"):
                return "YLT" + s[4:]
            elif s.startswith("Y_GE"):
                return "YGE" + s[4:]
            else:
                raise ValueError(s)
        else:
            i = int(s.strip("Y"))
            return "Y%02d" % i

    def format_age_num(s):
        if s.startswith("Y_"):
            if s.startswith("Y_LT"):
                return float(s.replace("Y_LT", ""))
            elif s.startswith("Y_GE"):
                return float(s.replace("Y_GE", ""))
            else:
                raise ValueError(s)
        else:
            i = int(s.strip("Y"))
            return float(i)

    def format_value(s):
        if s.strip() == ":":
            return numpy.nan
        else:
            return float(s.strip(" ebp"))

    fLOG("step 0, reading")
    dff = pandas.read_csv(temp, sep="\t", encoding="utf8")

    if stop_at is not None:
        fLOG("step 0, shortening")
        dfsmall = dff.head(n=stop_at)
        df = dfsmall
    else:
        df = dff

    fLOG("step 1, size=", df.shape)
    dfi = df.reset_index().set_index("indic_de,sex,age,geo\\time")
    dfi = dfi.drop('index', axis=1)
    dfs = dfi.stack()
    dfs = pandas.DataFrame({"valeur": dfs})

    fLOG("step 2, size=", dfs.shape)
    dfs["valeur"] = dfs["valeur"].astype(str)
    dfs["valeur"] = dfs["valeur"].apply(format_value)
    dfs = dfs[dfs.valeur >= 0].copy()
    dfs = dfs.reset_index()
    dfs.columns = ["index", "annee", "valeur"]

    fLOG("step 3, size=", dfs.shape)
    dfs["age"] = dfs["index"].apply(lambda i: format_age(i.split(",")[2]))
    dfs["age_num"] = dfs["index"].apply(
        lambda i: format_age_num(i.split(",")[2]))
    dfs["indicateur"] = dfs["index"].apply(lambda i: i.split(",")[0])
    dfs["genre"] = dfs["index"].apply(lambda i: i.split(",")[1])
    dfs["pays"] = dfs["index"].apply(lambda i: i.split(",")[3])

    fLOG("step 4")
    dfy = dfs.drop('index', axis=1)
    dfy.to_csv(final_name, sep="\t", encoding="utf8", index=False)
    return final_name
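Steps 1-3 above split the packed ``indic_de,sex,age,geo\time`` key into separate columns; a self-contained sketch of the same idea on a toy frame:

# Sketch: splitting a comma-packed key column into separate columns.
import pandas

toy = pandas.DataFrame({"index": ["LIFEXP,F,Y01,FR", "PROBDEATH,M,Y02,DE"],
                        "valeur": [82.1, 0.0003]})
for pos, col in enumerate(["indicateur", "genre", "age", "pays"]):
    toy[col] = toy["index"].apply(lambda i, p=pos: i.split(",")[p])
print(toy.drop("index", axis=1))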
Example #23
def elections_vote_place_address(folder=".", hide_warning=False, fLOG=noLOG):
    """
    Scrapes and extracts addresses for every vote place (bureau de vote in French).

    @param      folder          where to download the scraped pages
    @param      hide_warning    hide warnings
    @param      fLOG            logging function
    @return                     dictionary

    The function does not retrieve everything due to the irregular format.
    Sometimes, the city is missing or written above.
    """
    _elections_vote_place_address_patterns = \
        _elections_vote_place_address_patterns_()

    files = []
    for deps in range(1, 96):
        last = "bureaudevote%02d.htm" % deps
        url = "http://bureaudevote.fr/"
        try:
            f = download_data(last, website=url, whereTo=folder, fLOG=fLOG)
        except (urllib.error.HTTPError,
                DownloadDataException):  # pragma: no cover
            # backup plan
            files = download_data("bureauxdevote.zip",
                                  website="xd",
                                  whereTo=folder,
                                  fLOG=fLOG)
            break
        if isinstance(f, list):
            f = f[0]
        files.append(f)

    # extract data
    regex = [re.compile(_) for _ in _elections_vote_place_address_patterns]
    rows = []
    exc = []
    for data in files:
        lrows = []
        with open(data, "r", encoding="iso-8859-1") as f:
            content = f.read().lower()
        content = html_to_text(content)
        content0 = content
        content = content.replace("\n", " ").replace("\t", " ")
        atous = []
        for reg in regex:
            atous.extend(reg.findall(content))
        if 0 < len(atous) < 4:
            mes = "Not enough vote places ({2}) in\n{0}\nFOUND\n{3}\nCONTENT\n{1}".format(
                data, content0, len(atous), "\n".join(str(_) for _ in atous))
            exc.append(Exception(mes))
        if len(atous) > 1:
            for t in atous:
                ad = t[-3].split("-")
                address = ad[-1].strip(" ./<>-")
                place = "-".join(ad[:-1]).strip(" ./<>-")
                if "bureau de vote" in place:
                    if not hide_warning:
                        warnings.warn("Too long address {0}".format(t))
                else:
                    try:
                        lrows.append(
                            dict(n=int(t[1]),
                                 city=t[-1].strip(" .<>/"),
                                 zip=t[-2],
                                 address=address,
                                 place=place))
                    except ValueError as e:  # pragma: no cover
                        raise DataFormatException(
                            "issue with {0}".format(t)) from e
                    if len(lrows[-1]["city"]) <= 1:
                        mes = "No City in {0}\nROWS\n{2}\nCONTENT\n{1}".format(
                            t, content0,
                            "\n".join(str(_)
                                      for _ in lrows))  # pragma: no cover
                        raise DataFormatException(mes)  # pragma: no cover
        if lrows:
            rows.extend(lrows)
        elif "06.htm" in data:
            mes = "Not enough vote places ({2}) in\n{0}\nFOUND\n{3}\nCONTENT\n{1}".format(
                data, content0, len(lrows),
                "\n".join(str(_) for _ in lrows))  # pragma: no cover
            raise DataFormatException(mes)  # pragma: no cover
    if len(exc) > 2:
        mes = "Exception raised: {0}\n---------\n{1}".format(  # pragma: no cover
            len(exc), "\n########################\n".join(str(_) for _ in exc))
        raise DataFormatException(mes)  # pragma: no cover
    return pandas.DataFrame(rows)
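The rows collected above become a dataframe whose columns follow the ``dict(...)`` call; a short usage sketch:

# Sketch: columns are n, city, zip, address and place.
df = elections_vote_place_address(folder="temp_bureaux")
print(df[["zip", "city", "address", "place"]].head())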
Example #24
    def test_matplotlib_example(self):
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        progs = ["ffmpeg"]
        if not sys.platform.startswith("win"):
            progs.append("avconv")
        errs = []
        prog = None
        for prog in progs:
            out, err = run_cmd(prog, wait=True, fLOG=fLOG)
            exps = "usage:"
            if err is None or len(err) == 0 or (exps not in out and exps not in err):
                errs.append((prog, err))
            else:
                break

        if len(errs) >= len(progs):
            if sys.platform.startswith("win"):
                fLOG("download ffmpeg")
                add_missing_development_version(
                    ["pyensae"], __file__, hide=True)
                from pyensae.datasource import download_data
                download_data("ffmpeg.zip", website="xd")
            else:
                raise FileNotFoundError(
                    "Unable to find '{1}'.\nPATH='{0}'\n--------\n[OUT]\n{2}\n[ERR]\n{3}".format(
                        os.environ["PATH"], prog, out,
                        "\n----\n".join("{0}:\n{1}".format(*_) for _ in errs)))

        temp = get_temp_folder(__file__, "temp_example_example")
        fix_tkinter_issues_virtualenv()

        # update a distribution based on new data.
        import numpy as np
        import matplotlib.pyplot as plt
        import scipy.stats as ss
        from matplotlib.animation import FuncAnimation, writers

        # To get the list of available writers
        if not writers.is_available(prog):
            writers.register(prog)
        fLOG(writers.list())

        class UpdateDist:

            def __init__(self, ax, prob=0.5):
                self.success = 0
                self.prob = prob
                self.line, = ax.plot([], [], 'k-')
                self.x = np.linspace(0, 1, 200)
                self.ax = ax

                # Set up plot parameters
                self.ax.set_xlim(0, 1)
                self.ax.set_ylim(0, 15)
                self.ax.grid(True)

                # This vertical line represents the theoretical value, to
                # which the plotted distribution should converge.
                self.ax.axvline(prob, linestyle='--', color='black')

            def init(self):
                self.success = 0
                self.line.set_data([], [])
                return self.line,

            def __call__(self, i):
                # This way the plot can continuously run and we just keep
                # watching new realizations of the process
                if i == 0:
                    return self.init()

                # Choose success based on exceed a threshold with a uniform
                # pick
                if np.random.rand(1,) < self.prob:  # pylint: disable=W0143
                    self.success += 1
                y = ss.beta.pdf(self.x, self.success + 1,
                                (i - self.success) + 1)
                self.line.set_data(self.x, y)
                return self.line,

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        ud = UpdateDist(ax, prob=0.7)
        anim = FuncAnimation(fig, ud, frames=np.arange(100), init_func=ud.init,
                             interval=100, blit=True)

        try:
            Writer = writers[prog]
        except KeyError as e:
            if prog == "avconv":
                from matplotlib.animation import AVConvWriter
                Writer = AVConvWriter
            else:
                raise
        writer = Writer(fps=15, metadata=dict(artist='Me'), bitrate=1800)
        anim.save(os.path.join(temp, 'lines2.mp4'), writer=writer)

        plt.close('all')
        fLOG("end")
Example #26
# coding: latin-1
import sys
sys.path.append("../../../../program/python/pyensae/src")  # unnecessary line

from pyensae.datasource import download_data
import pandas

download_data("td9_data.zip", website = 'xd')
file1 = "td9_full.txt"
tbl = pandas.read_csv (file1, sep = "\t")

from pandas.tools.plotting import scatter_plot

gr = tbl.groupby(['lng', 'lat'], as_index=False).agg(lambda x: len(x))

# see http://dev.openlayers.org/docs/files/OpenLayers/Marker-js.html to change the marker
html = """
<html><body>
  <div id="mapdiv"></div>
  <script src="http://www.openlayers.org/api/OpenLayers.js"></script>
  <script>
    map = new OpenLayers.Map("mapdiv");
    map.addLayer(new OpenLayers.Layer.OSM());
    var proj =  new OpenLayers.Projection("EPSG:4326");
 
    var zoom=13;
 
    var markers = new OpenLayers.Layer.Markers( "Markers" );
    map.addLayer(markers);
    
    __VELIB__