def __init__(self, dataset, dataset_doc=None):
    """
    :param Datasets dataset: Datasets instance
    """
    super().__init__(dataset)
    self.dataset_doc = dataset_doc
    self.store_path = self.get_store_path()

    # TODO: take this info from the DSD instead of going through dataflows
    self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
    self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

    if self.dataset_doc and self.dataset_doc["enable"]:
        #self.last_update = self.dataset_doc["last_update"]
        self.last_update = self.dataset_doc["download_last"]
    else:
        self.last_update = self.dataset.download_last  #self.dataset.last_update

    self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                sdmx_client=self.fetcher.xml_sdmx)
    self.xml_dsd.concepts = self.fetcher._concepts
    self.xml_dsd.codelists = self.fetcher._codelists

    self._load_dsd()

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    self.rows = self._get_data_by_dimension()
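# Illustration only: a minimal standalone sketch of the last_update
# resolution performed above. It assumes dataset_doc is a plain dict with
# "enable" and "download_last" keys and that the dataset object exposes
# download_last, as in this module; the helper name is hypothetical.
def resolve_last_update(dataset, dataset_doc=None):
    """Prefer the doc's download date when the doc is enabled, otherwise
    fall back to the dataset's own download_last."""
    if dataset_doc and dataset_doc.get("enable"):
        return dataset_doc["download_last"]
    return dataset.download_last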
class INSEE_Data(SeriesIterator):

    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)
        self.store_path = self.get_store_path()

        if "series_last_update" not in self.dataset.metadata:
            self.dataset.metadata["series_last_update"] = {}

        # TODO: take this info from the DSD instead of going through dataflows
        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.fetcher.xml_sdmx)
        self.xml_dsd.concepts = self.fetcher._concepts
        self.xml_dsd.codelists = self.fetcher._codelists

        self._load_dsd()

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dsd_id,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        self.rows = self._get_data_by_dimension()

    def _load_dsd_by_element(self):
        # FIXME: codelists and concepts missing?
        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s" % self.dsd_id
        download = Downloader(url=url,
                              filename="datastructure-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _load_dsd(self):
        """
        TODO: there is one DSD for each group of series (about 400):
        - one downloaded DSD is shared by several datasets
        - 668 datasets
        """
        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url,
                              filename="dsd-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if response:
            if response.status_code == HTTP_ERROR_LONG_RESPONSE:
                self._load_dsd_by_element()
                return
            elif response.status_code >= 400:
                # raise_for_status() raises the HTTPError itself
                response.raise_for_status()

        if not os.path.exists(filepath):
            self._load_dsd_by_element()
            return

        self.fetcher.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _set_dataset(self):
        dataset = dataset_converter(self.xml_dsd, self.dataset_code,
                                    dsd_id=self.dsd_id)
        self.dataset.dimension_keys = dataset["dimension_keys"]
        self.dataset.attribute_keys = dataset["attribute_keys"]
        self.dataset.concepts = dataset["concepts"]
        self.dataset.codelists = dataset["codelists"]

    def _get_dimensions_from_dsd(self):
        return get_dimensions_from_dsd(self.xml_dsd, self.provider_name,
                                       self.dataset_code)

    def _get_data_by_dimension(self):
        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        choice = "avg"
        if self.dataset_code in ["IPC-2015-COICOP"]:
            choice = "max"

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions,
                                                            choice=choice)
        count_dimensions = len(dimension_keys)

        logger.info("choice[%s] - filterkey[%s] - count[%s] - provider[%s] - dataset[%s]" % (
            choice, _key, len(dimension_values),
            self.provider_name, self.dataset_code))

        for dimension_value in dimension_values:
            # For each value of the selected dimension, generate a URL key
            key = get_key_for_dimension(count_dimensions, position,
                                        dimension_value)

            url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (
                self.dataset_code, key)
            if self._is_good_url(url) is False:
                logger.warning("bypass bad url[%s]" % url)
                continue

            filename = "data-%s-%s.xml" % (self.dataset_code,
                                           key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  use_existing_file=self.fetcher.use_existing_file,  # NOT USED FOR INSEE
                                  client=self.fetcher.requests_client)
            filepath, response = download.get_filepath_and_response()

            if response is not None:
                self._add_url_cache(url, response.status_code)

            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            else:
                continue

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                # raise_for_status() raises the HTTPError itself
                response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)

        yield None, None

    def _is_updated(self, bson):
        """Check whether the series has changed.

        Return True if the series must be updated,
        False if it is already up to date.
        """
        if bson["key"] not in self.dataset.metadata["series_last_update"]:
            self.dataset.metadata["series_last_update"][bson["key"]] = bson.get('last_update')
            return True

        last_update = self.dataset.metadata["series_last_update"][bson["key"]]
        series_updated = bson.get('last_update')

        if not series_updated:
            return True

        if series_updated > last_update:
            return True

        return False

    def clean_field(self, bson):
        bson["attributes"].pop("IDBANK", None)
        bson = super().clean_field(bson)
        return bson

    def build_series(self, bson):
        self.dataset.add_frequency(bson["frequency"])
        if not self._is_updated(bson):
            raise errors.RejectUpdatedSeries(provider_name=self.provider_name,
                                             dataset_code=self.dataset_code,
                                             key=bson.get('key'))
        return bson
class INSEE_Data(object):

    def __init__(self, dataset=None, dataset_doc=None):
        """
        :param Datasets dataset: Datasets instance
        """
        self.dataset = dataset
        self.dataset_doc = dataset_doc

        self.attribute_list = self.dataset.attribute_list
        self.dimension_list = self.dataset.dimension_list
        self.provider_name = self.dataset.provider_name
        self.dataset_code = self.dataset.dataset_code

        if self.dataset_doc:
            self.last_update = self.dataset_doc["last_update"]
        else:
            # avoid AttributeError in is_updated() when no dataset_doc is given
            self.last_update = None

        self.xml_dsd = XMLStructure_2_1(provider_name=self.provider_name,
                                        dataset_code=self.dataset_code)

        self.rows = None
        self._load()

    def _load(self):
        self.dsd_id = self.dataset_code

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url,
                              filename="dsd-%s.xml" % self.dataset_code,
                              headers=SDMX_METADATA_HEADERS)
        self.xml_dsd.process(download.get_filepath())
        self.dataset.name = self.xml_dsd.dataset_name

        dimensions = OrderedDict()
        for key, item in self.xml_dsd.dimensions.items():
            dimensions[key] = item["dimensions"]
        self.dimension_list.set_dict(dimensions)

        attributes = OrderedDict()
        for key, item in self.xml_dsd.attributes.items():
            attributes[key] = item["values"]
        self.attribute_list.set_dict(attributes)

        url = "http://www.bdm.insee.fr/series/sdmx/data/%s" % self.dataset_code
        download = Downloader(url=url,
                              filename="data-%s.xml" % self.dataset_code,
                              headers=SDMX_DATA_HEADERS)

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                dimension_keys=self.xml_dsd.dimension_keys)

        # TODO: response and exception
        try:
            filepath, response = download.get_filepath_and_response()
        except requests.exceptions.HTTPError as err:
            logger.critical("OTHER HTTP ERROR: %s" % err.response.status_code)
            raise

        self.rows = self.xml_data.process(filepath)

    def __next__(self):
        _series = next(self.rows)
        if not _series:
            raise StopIteration()
        return self.build_series(_series)

    def is_updated(self, bson):
        """Check whether the series has changed."""
        if not self.last_update:
            return True

        series_updated = bson['last_update']
        _is_updated = series_updated > self.last_update

        if not _is_updated and logger.isEnabledFor(logging.INFO):
            logger.info("bypass updated dataset_code[%s][%s] - idbank[%s][%s]" % (
                self.dataset_code, self.last_update,
                bson['key'], series_updated))

        return _is_updated

    def build_series(self, bson):
        # TODO: last_update: update the dataset?
        #bson["last_update"] = self.last_update
        return bson
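# Illustration only: how this older INSEE_Data variant could be driven.
# It implements __next__ but not __iter__, so a caller pulls series one at a
# time with next() until the underlying rows generator returns a falsy value
# and StopIteration is raised. The `dataset` and `doc` variables are
# assumptions for the sketch, not defined in this module.
data = INSEE_Data(dataset=dataset, dataset_doc=doc)
while True:
    try:
        series = next(data)
    except StopIteration:
        break
    if data.is_updated(series):
        print(series["key"], series["last_update"])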