def test_theano_logreg(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    if is_travis_or_appveyor():
        # it requires latex
        return
    from theano import config
    fLOG(config)
    from src.ensae_teaching_cs.examples.theano_logreg import (
        theano_sgd_optimization_mnist, theano_predict)
    temp = get_temp_folder(__file__, "temp__theano_logreg")
    dataset = "mnist.pkl.gz"
    if not os.path.exists(dataset):
        download_data(dataset, website="http://deeplearning.net/data/mnist/")
    model = os.path.join(temp, "log_reg_theano.bin")
    theano_sgd_optimization_mnist(
        dataset=dataset, saved_model=model, n_epochs=2, fLOG=fLOG)
    pred = theano_predict(model, dataset, 10)
    fLOG(pred)
    fLOG(type(pred))
    self.assertEqual(len(pred), 10)
def wolf_xml(url="http://pauillac.inria.fr/~sagot/index.html",
             temp_folder=".", fLOG=noLOG):
    """
    The `WOLF <http://alpage.inria.fr/~sagot/wolf-en.html>`_
    (Wordnet Libre du Français, Free French Wordnet) is a free semantic
    lexical resource (wordnet) for French.
    This data is licensed under
    `Cecill-C license <http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.html>`_.
    Language is French.

    @param      url             url
    @param      fLOG            logging function
    @param      temp_folder     where to download
    @return                     list of files
    """
    link = url
    page = download_page(link)
    reg = re.compile("href=\\\"(https.*?wolf.*?[.]bz2)\\\"")
    alls = reg.findall(page)
    if len(alls) == 0:
        raise LinkNotFoundError(  # pragma: no cover
            "unable to find a link on a .bz2 file on page\n{}".format(page))
    url = alls[0]
    spl = url.split("/")
    url = "/".join(spl[:-1]) + "/"
    url2 = "/".join(spl[:-2]) + "/31718/"
    try:
        dtd = download_data("debvisdic-strict.dtd", url=[url2, "xd"],
                            fLOG=fLOG, whereTo=temp_folder)
    except DownloadDataException:
        dtd = None
    name = spl[-1].strip('.')
    try:
        local = download_data(name, url=[url, "xd"],
                              fLOG=fLOG, whereTo=temp_folder)
    except DownloadDataException:
        local = None
    if local is not None and isinstance(local, str):
        local = [local]
    # We check the file was downloaded.
    expected = os.path.join(temp_folder, "wolf-1.0b4.xml")
    if local is None or not os.path.exists(expected):  # pragma: no cover
        res = download_data("wolf-1.0b4.xml.zip",
                            whereTo=temp_folder, fLOG=fLOG)
        if not os.path.exists(expected):
            raise FileNotFoundError(expected)
        return res
    elif isinstance(dtd, list):
        return local + dtd
    return local + [dtd]  # pragma: no cover
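# Usage sketch for wolf_xml (added for illustration, not part of the original
# code): it assumes the default URL is still reachable and that download_page,
# download_data and the exception classes used above are importable here.
if __name__ == "__main__":
    wolf_files = wolf_xml(temp_folder=".", fLOG=print)
    for wolf_file in wolf_files:
        print(wolf_file)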
def load_sentiment_dataset(cache="."):
    """
    Returns a set of English sentences labelled with a positive
    or negative sentiment.
    Source: `Sentiment Labelled Sentences Data Set
    <https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences>`_.

    @param      cache       where to cache or unzip the data if downloaded a second time
    @return                 dataframe with the labelled sentences
    """
    from pyensae.datasource import download_data
    # url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/"
    name = "sentiment_labelled_sentences.zip"
    res = download_data(name, whereTo=cache)
    if len(res) != 9:
        raise ValueError("Unzipping '{0}' failed.".format(name))
    dfs = []
    for fi in res:
        if ".txt" not in fi or "readme" in fi or "__MACOSX" in fi:
            continue
        df = pandas.read_csv(fi, sep='\t', quoting=3,
                             names=['sentence', 'sentiment'])
        df["source"] = os.path.splitext(os.path.split(fi)[-1])[0]
        dfs.append(df)
    return pandas.concat(dfs)
def data_cpt_ENSAE_2016_11(folder=".", fLOG=noLOG):
    """
    Returns the data for the competition
    `Python 2A ENSAE 2016 <https://competitions.codalab.org/competitions/13301>`_,
    located on github `ensae_competition_2016.zip
    <https://github.com/sdpython/ensae_teaching_cs/raw/master/_doc/competitions/
    2016_ENSAE_2A/ensae_competition_2016.zip>`_.

    @param      folder      where to download and unzip
    @param      fLOG        logging function
    @return                 2 dataframes, one with X, Y, the other one with only X
    """
    url = "https://github.com/sdpython/ensae_teaching_cs/raw/master/_doc/competitions/2016_ENSAE_2A/"
    file = "ensae_competition_2016.zip"
    files = download_data(file, url=url, whereTo=folder, fLOG=fLOG)
    df1 = pandas.read_csv(
        [f for f in files if f.endswith("ensae_competition_train.txt")][0],
        header=[0, 1], sep="\t", index_col=0)
    df2 = pandas.read_csv(
        [f for f in files if "test_X" in f][0],
        header=[0, 1], sep="\t", index_col=0)
    return df1, df2
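# Usage sketch (added for illustration, not in the original code): assumes the
# imports used by the function above (pandas, download_data) are available and
# that the github URL still serves the archive.
if __name__ == "__main__":
    df_train, df_test = data_cpt_ENSAE_2016_11(folder=".", fLOG=print)
    print(df_train.shape, df_test.shape)
    print(df_train.head())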
def get_seattle_streets(filename=None, folder="."):
    """
    Retrieves processed data from
    `Seattle Streets <https://data.seattle.gov/dataset/Street-Network-Database/afip-2mzr/data>`_.

    @param      filename        local filename
    @param      folder          temporary folder where to download files
    @return                     local filename of the shapefile

    The data is downloaded only if *filename* is None.
    """
    if filename is None:
        download_data("WGS84_seattle_street.zip", whereTo=folder)
        filename = os.path.join(folder, "Street_Network_Database.shp")
    elif not os.path.exists(filename):
        raise FileNotFoundError(filename)
    return filename
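# Usage sketch (illustrative, not in the original code): the first call
# downloads WGS84_seattle_street.zip into the folder; reading the shapefile
# itself would require an extra module such as pyshp.
if __name__ == "__main__":
    shp_name = get_seattle_streets(folder=".")
    print("shapefile:", shp_name)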
def get_data(whereTo=".", timeout=None, fLOG=noLOG):
    """
    Returns the data about Paris streets.
    The arcs are assumed to be unique: if :math:`j \\rightarrow k` is present,
    :math:`k \\rightarrow j` is not. This is checked by a test.

    @param      whereTo         folder where to download the data
    @param      timeout         timeout (seconds) when establishing the connection
    @param      fLOG            logging function
    @return                     list of arcs

    An arc is defined by a 6-tuple with the following information:

    - v1: index of the first node
    - v2: index of the second node
    - ways: one-way or both directions
    - p1: coordinates of node 1
    - p2: coordinates of node 2
    - d: distance
    """
    from pyensae.datasource import download_data
    data = download_data("paris_54000.zip", whereTo=whereTo,
                         fLOG=fLOG, timeout=timeout)
    name = data[0]
    with open(name, "r") as f:
        lines = f.readlines()
    vertices = []
    edges = []
    for i, line in enumerate(lines):
        spl = line.strip("\n\r").split(" ")
        if len(spl) == 2:
            vertices.append((float(spl[0]), float(spl[1])))
        elif len(spl) == 5 and i > 0:
            v1, v2 = int(spl[0]), int(spl[1])
            ways = int(spl[2])  # both directions or not
            p1 = vertices[v1]
            p2 = vertices[v2]
            edges.append((v1, v2, ways, p1, p2,
                          distance_haversine(p1[0], p1[1], p2[0], p2[1])))
        elif i > 0:
            raise Exception("unable to interpret line {0}: ".format(i) + line)
    # checks every arc appears only once
    pairs = {}
    for e in edges:
        p = e[:2]
        if p in pairs:
            raise ValueError("unexpected pairs, already present: " + str(e))
        pairs[p] = True
    return edges
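# Usage sketch (illustrative, not in the original code): sums the last field
# of every arc, which is the distance computed by distance_haversine above.
if __name__ == "__main__":
    arcs = get_data(whereTo=".", fLOG=print)
    total = sum(arc[-1] for arc in arcs)
    print(len(arcs), "arcs, total length:", total)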
def test_mobilenet(self):
    src = ("https://s3.amazonaws.com/onnx-model-zoo/mobilenet/"
           "mobilenetv2-1.0/")
    model_file = "mobilenetv2-1.0.onnx"
    download_data(model_file, website=src)
    X = numpy.random.rand(1, 3, 224, 224).astype(dtype=numpy.float32)
    rts = ['python', 'python_compiled_debug', 'python_compiled',
           'onnxruntime1']
    res = []
    for i, rt in enumerate(rts):
        oinf = OnnxInference(model_file, runtime=rt)
        self.assertNotEmpty(oinf)
        self.assertEqual(oinf.input_names[:1], ['data'])
        if hasattr(oinf, 'inits_'):
            self.assertIn("mobilenetv20_features_conv0_weight", oinf.inits_)
            self.assertEqualArray(
                (0, -1), oinf.inits_["reshape_attr_tensor421"]['value'])
        name = oinf.input_names[0]
        out = oinf.output_names[0]
        if 'debug' in rt:
            Y, stdout, _ = self.capture(
                lambda oi=oinf: oi.run({name: X}))  # pylint: disable=W0640
            self.assertIn('-=', stdout)
        else:
            Y = oinf.run({name: X})
        if any(map(numpy.isnan, Y[out].ravel())):
            raise AssertionError("Runtime {}:{} produces NaN.\n{}".format(
                i, rt, Y[out]))
        res.append((rt, Y[out]))

    for rt, r in res[1:]:
        # compare every runtime's output against the first runtime's output
        exp = numpy.squeeze(res[0][1])
        got = numpy.squeeze(r)
        try:
            self.assertEqual(exp.shape, got.shape)
            self.assertEqualArray(got, exp)
        except AssertionError as e:
            raise AssertionError(
                "Issue with runtime: '{}'.".format(rt)) from e
def elections_legislatives_bureau_vote(source=None, folder=".", fLOG=noLOG):
    """
    Retrieves data from
    `Résultat des élections législatives françaises de 2012 au niveau bureau de vote
    <https://www.data.gouv.fr/fr/datasets/resultat-des-elections-legislatives-francaises-de-2012-au-niveau-bureau-de-vote-nd/>`_.

    @param      source      should be None unless you want to use the backup plan ("xd")
    @param      folder      where to download
    @return                 dictionary of dataframes (one per round, keys "T1" and "T2")

    Other sources:

    * `Résultats élections municipales 2014 par bureau de vote
      <http://www.nosdonnees.fr/dataset/resultats-elections-municipales-2014-par-bureau-de-vote>`_
    * `Elections 2015 - Découpage des bureaux de Vote
      <https://www.data.gouv.fr/fr/datasets/elections-2015-decoupage-des-bureaux-de-vote/>`_
    * `Contours des cantons électoraux départementaux 2015
      <https://www.data.gouv.fr/fr/datasets/contours-des-cantons-electoraux-departementaux-2015/>`_
    * `Découpage électoral de la commune, pour les élections législatives
      <https://www.data.gouv.fr/fr/datasets/circonscriptions/>`_ (odd format)
    * `Statistiques démographiques INSEE sur les nouvelles circonscriptions législatives de 2012
      <https://www.data.gouv.fr/fr/datasets/statistiques-demographiques-insee
      -sur-les-nouvelles-circonscriptions-legislatives-de-2012-nd/>`_
    """
    if source is None:
        try:  # pragma: no cover
            with urllib.request.urlopen("http://www.nosdonnees.fr/") as f:
                url = "http://www.nosdonnees.fr/storage/f/2013-03-05T184148/"
                if f is None:
                    raise Exception(
                        "Not sure we can continue. Pretty sure we should stop.")
        except (urllib.error.HTTPError, RemoteDisconnected):  # pragma: no cover
            url = "xd"
        file = "LG12_BV_T1T2.zip"
    else:
        url = source
        file = "LG12_BV_T1T2.zip"
    data = download_data(file, website=url, whereTo=folder, fLOG=fLOG)
    res = {}
    for d in data:
        df = pandas.read_csv(d, encoding="latin-1", sep=";", low_memory=False)
        if d.endswith("_T2.txt"):
            key = "T2"
        elif d.endswith("_T1.txt"):
            key = "T1"
        else:
            raise ValueError(  # pragma: no cover
                "Unable to guess key for filename: '{0}'".format(d))
        res[key] = df
    return res
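# Usage sketch (illustrative, not in the original code): assumes one of the two
# sources is reachable; the result maps the round ("T1", "T2") to a dataframe.
if __name__ == "__main__":
    rounds = elections_legislatives_bureau_vote(folder=".", fLOG=print)
    for key, df_round in rounds.items():
        print(key, df_round.shape)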
def load_irep(cache="."):
    """
    Downloads the data of the `registre des émissions polluantes (IREP)
    <http://www.georisques.gouv.fr/dossiers/irep/telechargement#>`_
    (French register of polluting emissions) for the years 2003-2017.

    @param      cache       where to cache or unzip the data if downloaded a second time
    @return                 list of files
    """
    from pyensae.datasource import download_data
    name = "irep.zip"
    res = download_data(name, whereTo=cache)
    return res
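# Usage sketch (illustrative, not in the original code): simply lists the
# files extracted from irep.zip.
if __name__ == "__main__":
    for irep_file in load_irep(cache="."):
        print(irep_file)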
def test_wolf_backup(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    temp = get_temp_folder(__file__, "temp_wolf_backup")
    from pyensae.datasource import download_data
    res = download_data("wolf-1.0b4.xml.zip", whereTo=temp, fLOG=fLOG)
    self.assertEqual(len(res), 1)
    outfile = [os.path.join(temp, _) for _ in os.listdir(temp)]
    self.assertTrue(len(outfile) != 0)
    if os.stat(outfile[0]).st_size < 1000000:
        raise Exception("small size")
def test_fairtest(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    # ete3 is needed by fairtest.
    import ete3
    assert ete3 is not None
    # We check it is working. Otherwise, you should remove PyQt5.
    try:
        import fairtest as skip___
    except ImportError:
        path = os.path.normpath(
            os.path.abspath(
                os.path.join(
                    os.path.split(__file__)[0],
                    "..", "..", "..", "fairtest", "src")))
        if path not in sys.path:
            sys.path.append(path)
        import fairtest as skip___

    if is_travis_or_appveyor():
        # no fair test
        return

    from fairtest import DataSource, Testing, train, test, report  # pylint: disable=E0401

    temp = get_temp_folder(__file__, "temp_fairtest")
    data = ds.download_data(
        "adult.data",
        url="https://archive.ics.uci.edu/ml/machine-learning-databases/adult/",
        whereTo=os.path.join(temp, ".."))
    names = ("age,Workclass,fnlwgt,education,education-num,marital-status,"
             "occupation,relationship,"
             "race,sex,capital-gain,capital-loss,hours-per-week,"
             "native-country,income").split(",")
    df = pandas.read_csv(data, names=names)
    if df.shape[0] > 100:
        fLOG(df.shape)
        df = df[:100]
    data = DataSource(df, budget=1, conf=0.95)
    SENS = ['sex', 'race']  # Protected features
    TARGET = 'income'       # Output
    EXPL = ''               # Explanatory feature
    inv = Testing(data, SENS, TARGET, EXPL)
    train([inv])
    test([inv])
    report([inv], "adult", temp)
def load_enedis_dataset(dest='.', fLOG=None):
    """
    Returns data extracted from the :epkg:`Enedis` website:
    `Production électrique annuelle par filière à la maille commune
    <https://data.enedis.fr/explore/dataset/production-electrique-par-filiere-a-la-maille-commune/export/>`_.
    The dataset is an extract for the years 2015-2016.
    The download relies on the module :epkg:`pyensae`.
    Notebooks associated to this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('visualisation', 'enedis')
        links = ['    * %s' % s for s in links]
        print('\\n'.join(links))

    @param      dest    destination folder
    @param      fLOG    logging function
    @return             :epkg:`pandas:DataFrame`
    """
    from pyensae.datasource import download_data
    name = "production-electrique-par-filiere-a-la-maille-commune.extrait.2015-2016.csv.zip"
    if fLOG:
        res = download_data(name, whereTo=dest, fLOG=fLOG)
    else:
        res = download_data(name, whereTo=dest)
    if len(res) != 1:
        raise ValueError(  # pragma: no cover
            "Unzipping '{0}' failed.".format(name))
    df = pandas.read_csv(res[0], sep=';', encoding='utf-8')
    df['long'] = df['Geo Point 2D'].apply(
        lambda x: float(x.split(',')[1].strip()))
    df['lat'] = df['Geo Point 2D'].apply(
        lambda x: float(x.split(',')[0].strip()))
    return df
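# Usage sketch (illustrative, not in the original code): the 'long' and 'lat'
# columns are added by the loader from the 'Geo Point 2D' column.
if __name__ == "__main__":
    enedis = load_enedis_dataset(dest=".", fLOG=print)
    print(enedis.shape)
    print(enedis[["long", "lat"]].head())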
def test_euler(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    folder = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                          "temp_rues_euler")
    if not os.path.exists(folder):
        os.mkdir(folder)
    edges = get_data(whereTo=folder, fLOG=fLOG)
    data = download_data("added.zip", whereTo=folder, fLOG=fLOG)
    with open(data[0], "r") as f:
        text = f.read()
    added_edges = eval(text)
    path = euler_path(edges, added_edges)
    fLOG(len(path), len(edges) + len(added_edges))
    for p in path[:5]:
        fLOG(len(p), p)
    for p in path[-5:]:
        fLOG(len(p), p)
def elections_vote_places_geo(source="xd", folder=".", fLOG=noLOG):
    """
    Retrieves data about vote places (*bureaux de vote* in French)
    with geocodes.

    @param      source      only the backup plan ("xd") is implemented,
                            None raises an exception
    @param      folder      where to download
    @param      fLOG        logging function
    @return                 dataframe
    """
    if source is None:
        raise NotImplementedError("use source='xd'")
    url = source
    file = "bureauxvotegeo.zip"
    data = download_data(file, website=url, whereTo=folder, fLOG=fLOG)
    for d in data:
        if d.endswith(".txt"):
            df = pandas.read_csv(d, sep="\t", encoding="utf-8")
            return df
    raise DataNotAvailableError(
        "Unable to find any csv file in '{0}'".format(file))
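# Usage sketch (illustrative, not in the original code): only the backup
# source "xd" is implemented, which is also the default value.
if __name__ == "__main__":
    places = elections_vote_places_geo(folder=".", fLOG=print)
    print(places.head())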
def load_sentiment_dataset(cache="."):
    """
    Returns a set of English sentences labelled with a positive
    or negative sentiment.
    Source: `Sentiment Labelled Sentences Data Set
    <https://archive.ics.uci.edu/
    ml/datasets/Sentiment+Labelled+Sentences>`_.
    Notebooks associated to this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'text_sentiment_wordvec')
        links = ['    * %s' % s for s in links]
        print('\\n'.join(links))

    @param      cache       where to cache or unzip the data if downloaded a second time
    @return                 dataframe with the labelled sentences
    """
    from pyensae.datasource import download_data
    # url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/"
    name = "sentiment_labelled_sentences.zip"
    res = download_data(name, whereTo=cache)
    if len(res) != 9:
        raise ValueError(  # pragma: no cover
            "Unzipping '{0}' failed.".format(name))
    dfs = []
    for fi in res:
        if ".txt" not in fi or "readme" in fi or "__MACOSX" in fi:
            continue
        df = pandas.read_csv(fi, sep='\t', quoting=3,
                             names=['sentence', 'sentiment'])
        df["source"] = os.path.splitext(os.path.split(fi)[-1])[0]
        dfs.append(df)
    return pandas.concat(dfs)
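# Usage sketch (illustrative, not in the original code): shows the label
# balance per source file once the sentences are loaded.
if __name__ == "__main__":
    sentiments = load_sentiment_dataset(cache=".")
    print(sentiments.groupby(["source", "sentiment"]).size())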
def elections_legislatives_circonscription_geo(source="xd", folder=".",
                                               fLOG=noLOG):
    """
    Retrieves data from `Contours des circonscriptions des législatives
    <https://www.data.gouv.fr/fr/datasets/countours-des-circonscriptions-des-legislatives-nd/>`_.

    @param      source      only the backup plan ("xd") is implemented,
                            None raises an exception
    @param      folder      where to download
    @return                 dataframe
    """
    if source is None:
        raise NotImplementedError(  # pragma: no cover
            "use source='xd'")
    url = source
    file = "toxicode_circonscriptions_legislatives.zip"
    data = download_data(file, website=url, whereTo=folder, fLOG=fLOG)
    for d in data:
        if d.endswith(".csv"):
            df = pandas.read_csv(d, sep=",", encoding="utf-8")
            return df
    raise DataNotAvailableError(
        "unable to find any csv file in '{0}'".format(file))
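# Usage sketch (illustrative, not in the original code): retrieves the
# constituency contours from the backup source "xd".
if __name__ == "__main__":
    circo = elections_legislatives_circonscription_geo(folder=".", fLOG=print)
    print(circo.shape, list(circo.columns))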
#coding:latin-1
import sys
import datetime
sys.path.append("../../../../program/python/pyensae/src")

from pyensae.datasource import download_data

print("A", datetime.datetime.now())
download_data("SQLiteSpy.zip", website='xd')
print("B", datetime.datetime.now())
download_data("td8_velib.zip", website='xd')
print("C", datetime.datetime.now())

from pyensae import import_flatfile_into_database

dbf = "td8_velib2.db3"

if False:
    print("import", datetime.datetime.now())
    import_flatfile_into_database(dbf, "td8_velib.txt")
    print("import", datetime.datetime.now())
    import_flatfile_into_database(dbf, "stations.txt", table="stations")
    print("import", datetime.datetime.now())

if False:
    import sqlite3
    conn = sqlite3.connect(dbf)
    data = conn.execute("SELECT * FROM stations")
    for d in data:
        print(d)
    conn.close()
def table_mortalite_euro_stat(
        url="http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/",
        name="demo_mlifetable.tsv.gz", final_name="mortalite.txt",
        whereTo=".", stop_at=None, fLOG=noLOG):
    """
    This function retrieves the mortality table from
    `EuroStat <http://ec.europa.eu/eurostat/fr>`_ through
    `table de mortalité <http://www.data-publica.com/opendata/7098--population-et-conditions-sociales-table-de-mortalite-de-1960-a-2010>`_
    (*this link is currently broken, data-publica does not provide such a
    database anymore, a copy is provided*).

    @param      url         data source
    @param      name        data table name
    @param      final_name  the data is compressed, it needs to be uncompressed
                            into a file, this parameter defines its name
    @param      whereTo     data needs to be downloaded, location of this place
    @param      stop_at     the overall process is quite long, if not None,
                            it only keeps the first rows
    @param      fLOG        logging function
    @return                 name of the produced file

    The function checks the file final_name exists.
    If it is the case, the data is not downloaded twice.

    The header contains a weird format as coordinates are separated by a comma::

        indic_de,sex,age,geo\\time   2013    2012    2011    2010    2009

    We need to preprocess the data to split this information into columns.
    The overall process takes 4-5 minutes, 10 seconds to download (< 10 Mb),
    4-5 minutes to preprocess the data (it could be improved).
    The processed data contains the following columns::

        ['annee', 'valeur', 'age', 'age_num', 'indicateur', 'genre', 'pays']

    Columns *age* and *age_num* look alike. *age_num* is numeric and is equal
    to *age* except when *age_num* is 85. Everybody above that age falls into
    the same category. The table contains many indicators:

    * PROBSURV: probability of surviving between two exact ages (px)
    * LIFEXP: life expectancy at the exact age (ex)
    * SURVIVORS: number of survivors at the exact age (lx)
    * PYLIVED: number of person-years lived between two exact ages (Lx)
    * DEATHRATE: death rate at age x (Mx)
    * PROBDEATH: probability of dying between two exact ages (qx)
    * TOTPYLIVED: total number of person-years lived after the exact age (Tx)
    """
    if os.path.exists(final_name) and os.stat(final_name).st_size > 1e7:
        return final_name
    temp = final_name + ".remove.txt"
    if not os.path.exists(temp) or os.stat(temp).st_size < 1e7:
        local = download_data(name, url=url, whereTo=whereTo)
        local = local[0] + ".gz"
        with gzip.open(local, 'rb') as f:
            file_content = f.read()
        content = str(file_content, encoding="utf8")
        with open(temp, "w", encoding="utf8") as f:
            f.write(content)

    def format_age(s):
        if s.startswith("Y_"):
            if s.startswith("Y_LT"):
                return "YLT" + s[4:]
            elif s.startswith("Y_GE"):
                return "YGE" + s[4:]
            else:
                raise SyntaxError(s)
        else:
            i = int(s.strip("Y"))
            return "Y%02d" % i

    def format_age_num(s):
        if s.startswith("Y_"):
            if s.startswith("Y_LT"):
                return float(s.replace("Y_LT", ""))
            elif s.startswith("Y_GE"):
                return float(s.replace("Y_GE", ""))
            else:
                raise SyntaxError(s)
        else:
            i = int(s.strip("Y"))
            return float(i)

    def format_value(s):
        if s.strip() == ":":
            return numpy.nan
        else:
            return float(s.strip(" ebp"))

    fLOG("step 0, reading")
    dff = pandas.read_csv(temp, sep="\t", encoding="utf8")

    if stop_at is not None:
        fLOG("step 0, shortening")
        dfsmall = dff.head(n=stop_at)
        df = dfsmall
    else:
        df = dff

    fLOG("step 1, size=", df.shape)
    dfi = df.reset_index().set_index("indic_de,sex,age,geo\\time")
    dfi = dfi.drop('index', axis=1)
    dfs = dfi.stack()
    dfs = pandas.DataFrame({"valeur": dfs})

    fLOG("step 2, size=", dfs.shape)
    dfs["valeur"] = dfs["valeur"].astype(str)
    dfs["valeur"] = dfs["valeur"].apply(format_value)
    dfs = dfs[dfs.valeur >= 0].copy()
    dfs = dfs.reset_index()
    dfs.columns = ["index", "annee", "valeur"]

    fLOG("step 3, size=", dfs.shape)
    dfs["age"] = dfs["index"].apply(lambda i: format_age(i.split(",")[2]))
    dfs["age_num"] = dfs["index"].apply(
        lambda i: format_age_num(i.split(",")[2]))
    dfs["indicateur"] = dfs["index"].apply(lambda i: i.split(",")[0])
    dfs["genre"] = dfs["index"].apply(lambda i: i.split(",")[1])
    dfs["pays"] = dfs["index"].apply(lambda i: i.split(",")[3])

    fLOG("step 4")
    dfy = dfs.drop('index', axis=1)
    dfy.to_csv(final_name, sep="\t", encoding="utf8", index=False)
    return final_name
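# Usage sketch (illustrative, not in the original code): stop_at keeps the
# preprocessing short; the function returns the name of the produced file,
# which is reused on the next call if it is already large enough.
if __name__ == "__main__":
    out_name = table_mortalite_euro_stat(stop_at=100, fLOG=print)
    sample = pandas.read_csv(out_name, sep="\t", encoding="utf8")
    print(sample.head())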
def elections_vote_place_address(folder=".", hide_warning=False, fLOG=noLOG):
    """
    Scrapes and extracts addresses for every vote place
    (*bureau de vote* in French).

    @param      folder          where to download the scraped pages
    @param      hide_warning    hide warnings
    @param      fLOG            logging function
    @return                     dataframe

    The function does not retrieve everything due to the irregular format.
    Sometimes, the city is missing or written above.
    """
    _elections_vote_place_address_patterns = _elections_vote_place_address_patterns_()
    files = []
    for deps in range(1, 96):
        last = "bureaudevote%02d.htm" % deps
        url = "http://bureaudevote.fr/"
        try:
            f = download_data(last, website=url, whereTo=folder, fLOG=fLOG)
        except (urllib.error.HTTPError, DownloadDataException):  # pragma: no cover
            # backup plan
            files = download_data("bureauxdevote.zip", website="xd",
                                  whereTo=folder, fLOG=fLOG)
            break
        if isinstance(f, list):
            f = f[0]
        files.append(f)
    # extract data
    regex = [re.compile(_) for _ in _elections_vote_place_address_patterns]
    rows = []
    exc = []
    for data in files:
        lrows = []
        with open(data, "r", encoding="iso-8859-1") as f:
            content = f.read().lower()
        content = html_to_text(content)
        content0 = content
        content = content.replace("\n", " ").replace("\t", " ")
        atous = []
        for reg in regex:
            atous.extend(reg.findall(content))
        if len(atous) < 4 and len(atous) > 0:
            mes = "Not enough vote places ({2}) in\n{0}\nFOUND\n{3}\nCONTENT\n{1}".format(
                data, content0, len(atous), "\n".join(str(_) for _ in atous))
            exc.append(Exception(mes))
        if len(atous) > 1:
            for t in atous:
                ad = t[-3].split("-")
                address = ad[-1].strip(" ./<>-")
                place = "-".join(ad[:-1]).strip(" ./<>-")
                if "bureau de vote" in place:
                    if not hide_warning:
                        warnings.warn("Too long address {0}".format(t))
                else:
                    try:
                        lrows.append(dict(n=int(t[1]),
                                          city=t[-1].strip(" .<>/"),
                                          zip=t[-2], address=address,
                                          place=place))
                    except ValueError as e:  # pragma: no cover
                        raise DataFormatException(
                            "issue with {0}".format(t)) from e
                    if len(lrows[-1]["city"]) <= 1:
                        mes = "No City in {0}\nROWS\n{2}\nCONTENT\n{1}".format(
                            t, content0,
                            "\n".join(str(_) for _ in lrows))  # pragma: no cover
                        raise DataFormatException(mes)  # pragma: no cover
        if lrows:
            rows.extend(lrows)
        elif "06.htm" in data:
            mes = "Not enough vote places ({2}) in\n{0}\nFOUND\n{3}\nCONTENT\n{1}".format(
                data, content0, len(lrows),
                "\n".join(str(_) for _ in lrows))  # pragma: no cover
            raise DataFormatException(mes)  # pragma: no cover
    if len(exc) > 2:
        mes = "Exception raised: {0}\n---------\n{1}".format(  # pragma: no cover
            len(exc),
            "\n########################\n".join(str(_) for _ in exc))
        raise DataFormatException(mes)  # pragma: no cover
    return pandas.DataFrame(rows)
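# Usage sketch (illustrative, not in the original code): scraping every page of
# bureaudevote.fr takes a while and the format is irregular, so the resulting
# dataframe may be incomplete.
if __name__ == "__main__":
    addresses = elections_vote_place_address(folder=".", fLOG=print)
    print(addresses.shape)
    print(addresses.head())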
def test_matplotlib_example(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    progs = ["ffmpeg"]
    if not sys.platform.startswith("win"):
        progs.append("avconv")
    errs = []
    prog = None
    for prog in progs:
        out, err = run_cmd(prog, wait=True, fLOG=fLOG)
        exps = "usage:"
        if (exps not in out and exps not in err) or err is None or len(err) == 0:
            errs.append((prog, err))
        else:
            break

    if len(errs) >= len(progs):
        if sys.platform.startswith("win"):
            fLOG("download ffmpeg")
            add_missing_development_version(
                ["pyensae"], __file__, hide=True)
            from pyensae.datasource import download_data
            download_data("ffmpeg.zip", website="xd")
        else:
            raise FileNotFoundError(
                "Unable to find '{1}'.\nPATH='{0}'\n--------\n[OUT]\n{2}\n[ERR]\n{3}".format(
                    os.environ["PATH"], prog, out,
                    "\n----\n".join("{0}:\n{1}".format(*_) for _ in errs)))

    temp = get_temp_folder(__file__, "temp_example_example")
    fix_tkinter_issues_virtualenv()

    # update a distribution based on new data.
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.stats as ss
    from matplotlib.animation import FuncAnimation, writers

    # To get the list of available writers
    if not writers.is_available(prog):
        writers.register(prog)
    fLOG(writers.list())

    class UpdateDist:

        def __init__(self, ax, prob=0.5):
            self.success = 0
            self.prob = prob
            self.line, = ax.plot([], [], 'k-')
            self.x = np.linspace(0, 1, 200)
            self.ax = ax

            # Set up plot parameters
            self.ax.set_xlim(0, 1)
            self.ax.set_ylim(0, 15)
            self.ax.grid(True)

            # This vertical line represents the theoretical value, to
            # which the plotted distribution should converge.
            self.ax.axvline(prob, linestyle='--', color='black')

        def init(self):
            self.success = 0
            self.line.set_data([], [])
            return self.line,

        def __call__(self, i):
            # This way the plot can continuously run and we just keep
            # watching new realizations of the process
            if i == 0:
                return self.init()

            # Choose success based on exceeding a threshold with a uniform pick
            if np.random.rand(1,) < self.prob:  # pylint: disable=W0143
                self.success += 1
            y = ss.beta.pdf(self.x, self.success + 1,
                            (i - self.success) + 1)
            self.line.set_data(self.x, y)
            return self.line,

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ud = UpdateDist(ax, prob=0.7)
    anim = FuncAnimation(fig, ud, frames=np.arange(100), init_func=ud.init,
                         interval=100, blit=True)
    try:
        Writer = writers[prog]
    except KeyError as e:
        if prog == "avconv":
            from matplotlib.animation import AVConvWriter
            Writer = AVConvWriter
        else:
            raise e
    writer = Writer(fps=15, metadata=dict(artist='Me'), bitrate=1800)
    anim.save(os.path.join(temp, 'lines2.mp4'), writer=writer)

    plt.close('all')
    fLOG("end")
#coding:latin-1
import sys
sys.path.append("../../../../program/python/pyensae/src")  # useless line
from pyensae.datasource import download_data
import pandas

download_data("td9_data.zip", website='xd')
file1 = "td9_full.txt"
tbl = pandas.read_csv(file1, sep="\t")

from pandas.tools.plotting import scatter_plot

gr = tbl.groupby(['lng', 'lat'], as_index=False).agg(lambda x: len(x))

# see http://dev.openlayers.org/docs/files/OpenLayers/Marker-js.html to change the marker
html = """
<html><body>
<div id="mapdiv"></div>
<script src="http://www.openlayers.org/api/OpenLayers.js"></script>
<script>
map = new OpenLayers.Map("mapdiv");
map.addLayer(new OpenLayers.Layer.OSM());
var proj = new OpenLayers.Projection("EPSG:4326");
var zoom=13;
var markers = new OpenLayers.Layer.Markers( "Markers" );
map.addLayer(markers);
__VELIB__