示例#1
0
文件: insee.py 项目: Widukind/dlstats
    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)

        self.store_path = self.get_store_path()

        if not "series_last_update" in self.dataset.metadata:
            self.dataset.metadata["series_last_update"] = {}

        #TODO: prendre cette info dans la DSD sans utiliser dataflows
        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.fetcher.xml_sdmx)
        self.xml_dsd.concepts = self.fetcher._concepts
        self.xml_dsd.codelists = self.fetcher._codelists

        self._load_dsd()

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dsd_id,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        self.rows = self._get_data_by_dimension()
示例#2
0
    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)

        self.store_path = self.get_store_path()

        if not "series_last_update" in self.dataset.metadata:
            self.dataset.metadata["series_last_update"] = {}

        #TODO: prendre cette info dans la DSD sans utiliser dataflows
        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.fetcher.xml_sdmx)
        self.xml_dsd.concepts = self.fetcher._concepts
        self.xml_dsd.codelists = self.fetcher._codelists

        self._load_dsd()

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dsd_id,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        self.rows = self._get_data_by_dimension()
示例#3
0
    def __init__(self, dataset, dataset_doc=None):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)

        self.dataset_doc = dataset_doc
        self.store_path = self.get_store_path()
        
        #TODO: prendre cette info dans la DSD sans utiliser dataflows
        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]        
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]
        
        if self.dataset_doc and self.dataset_doc["enable"]:
            #self.last_update = self.dataset_doc["last_update"]
            self.last_update = self.dataset_doc["download_last"]
        else:
            self.last_update = self.dataset.download_last #self.dataset.last_update

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.fetcher.xml_sdmx)        
        self.xml_dsd.concepts = self.fetcher._concepts
        self.xml_dsd.codelists = self.fetcher._codelists

        self._load_dsd()
        
        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                frequencies_supported=FREQUENCIES_SUPPORTED)
        
        self.rows = self._get_data_by_dimension()
示例#4
0
    def _load(self):

        self.dsd_id = self.dataset_code

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url, 
                              filename="dsd-%s.xml" % self.dataset_code,
                              headers=SDMX_METADATA_HEADERS)
        self.xml_dsd.process(download.get_filepath())
        
        self.dataset.name = self.xml_dsd.dataset_name
        
        dimensions = OrderedDict()
        for key, item in self.xml_dsd.dimensions.items():
            dimensions[key] = item["dimensions"]
        self.dimension_list.set_dict(dimensions)
        
        attributes = OrderedDict()
        for key, item in self.xml_dsd.attributes.items():
            attributes[key] = item["values"]
        self.attribute_list.set_dict(attributes)
        
        url = "http://www.bdm.insee.fr/series/sdmx/data/%s" % self.dataset_code
        download = Downloader(url=url, 
                              filename="data-%s.xml" % self.dataset_code,
                              headers=SDMX_DATA_HEADERS)

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                dimension_keys=self.xml_dsd.dimension_keys)
        
        #TODO: response and exception
        try:
            filepath, response = download.get_filepath_and_response()        
        except requests.exceptions.HTTPError as err:
            logger.critical("AUTRE ERREUR HTTP : %s" % err.response.status_code)
            raise
            
        self.rows = self.xml_data.process(filepath)
示例#5
0
class INSEE_Data(SeriesIterator):
    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)

        self.store_path = self.get_store_path()

        if not "series_last_update" in self.dataset.metadata:
            self.dataset.metadata["series_last_update"] = {}

        #TODO: prendre cette info dans la DSD sans utiliser dataflows
        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.fetcher.xml_sdmx)
        self.xml_dsd.concepts = self.fetcher._concepts
        self.xml_dsd.codelists = self.fetcher._codelists

        self._load_dsd()

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dsd_id,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        self.rows = self._get_data_by_dimension()

    def _load_dsd_by_element(self):

        #FIXME: Manque codelist et concepts ?

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s" % self.dsd_id
        download = Downloader(url=url,
                              filename="datastructure-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _load_dsd(self):
        """
        #TODO: il y a une DSD pour chaque groupe de séries (soit environ 400),
        - download 1 dsd partage par plusieurs dataset
        - 668 datase
        """

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url,
                              filename="dsd-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)

        filepath, response = download.get_filepath_and_response()

        if response:
            if response.status_code == HTTP_ERROR_LONG_RESPONSE:
                self._load_dsd_by_element()
                return
            elif response.status_code >= 400:
                raise response.raise_for_status()

        if not os.path.exists(filepath):
            self._load_dsd_by_element()
            return

        self.fetcher.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _set_dataset(self):

        dataset = dataset_converter(self.xml_dsd,
                                    self.dataset_code,
                                    dsd_id=self.dsd_id)
        self.dataset.dimension_keys = dataset["dimension_keys"]
        self.dataset.attribute_keys = dataset["attribute_keys"]
        self.dataset.concepts = dataset["concepts"]
        self.dataset.codelists = dataset["codelists"]

    def _get_dimensions_from_dsd(self):
        return get_dimensions_from_dsd(self.xml_dsd, self.provider_name,
                                       self.dataset_code)

    def _get_data_by_dimension(self):

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        choice = "avg"
        if self.dataset_code in ["IPC-2015-COICOP"]:
            choice = "max"

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions,
                                                            choice=choice)

        count_dimensions = len(dimension_keys)

        logger.info(
            "choice[%s] - filterkey[%s] - count[%s] - provider[%s] - dataset[%s]"
            % (choice, _key, len(dimension_values), self.provider_name,
               self.dataset_code))

        for dimension_value in dimension_values:
            '''Pour chaque valeur de la dimension, generer une key d'url'''

            key = get_key_for_dimension(count_dimensions, position,
                                        dimension_value)

            url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (
                self.dataset_code, key)
            if self._is_good_url(url) is False:
                logger.warning("bypass not good url[%s]" % url)
                continue

            filename = "data-%s-%s.xml" % (self.dataset_code,
                                           key.replace(".", "_"))
            download = Downloader(
                url=url,
                filename=filename,
                store_filepath=self.store_path,
                use_existing_file=self.fetcher.use_existing_file,
                #NOT USE FOR INSEE client=self.fetcher.requests_client
            )
            filepath, response = download.get_filepath_and_response()

            if not response is None:
                self._add_url_cache(url, response.status_code)

            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            elif not filepath or not os.path.exists(filepath):
                continue

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                raise response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)

        yield None, None

    def _is_updated(self, bson):
        """Verify if series changes

        Return True si la series doit etre mise a jour et False si elle est a jour
        """
        if not bson["key"] in self.dataset.metadata["series_last_update"]:
            self.dataset.metadata["series_last_update"][
                bson["key"]] = bson.get('last_update')
            return True

        last_update = self.dataset.metadata["series_last_update"][bson["key"]]

        series_updated = bson.get('last_update')
        if not series_updated:
            return True

        if series_updated > last_update:
            return True

        return False

    def clean_field(self, bson):
        bson["attributes"].pop("IDBANK", None)
        bson = super().clean_field(bson)
        return bson

    def build_series(self, bson):
        self.dataset.add_frequency(bson["frequency"])

        if not self._is_updated(bson):
            raise errors.RejectUpdatedSeries(provider_name=self.provider_name,
                                             dataset_code=self.dataset_code,
                                             key=bson.get('key'))

        return bson
示例#6
0
class INSEE_Data(object):
    
    def __init__(self, dataset=None, dataset_doc=None):
        """
        :param Datasets dataset: Datasets instance
        """
        self.dataset = dataset
        self.dataset_doc = dataset_doc
        self.attribute_list = self.dataset.attribute_list
        self.dimension_list = self.dataset.dimension_list
        self.provider_name = self.dataset.provider_name
        self.dataset_code = self.dataset.dataset_code

        if self.dataset_doc:
            self.last_update = self.dataset_doc["last_update"]

        self.xml_dsd = XMLStructure_2_1(provider_name=self.provider_name, 
                                        dataset_code=self.dataset_code)        
        
        self.rows = None
        self._load()
        
    def _load(self):

        self.dsd_id = self.dataset_code

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url, 
                              filename="dsd-%s.xml" % self.dataset_code,
                              headers=SDMX_METADATA_HEADERS)
        self.xml_dsd.process(download.get_filepath())
        
        self.dataset.name = self.xml_dsd.dataset_name
        
        dimensions = OrderedDict()
        for key, item in self.xml_dsd.dimensions.items():
            dimensions[key] = item["dimensions"]
        self.dimension_list.set_dict(dimensions)
        
        attributes = OrderedDict()
        for key, item in self.xml_dsd.attributes.items():
            attributes[key] = item["values"]
        self.attribute_list.set_dict(attributes)
        
        url = "http://www.bdm.insee.fr/series/sdmx/data/%s" % self.dataset_code
        download = Downloader(url=url, 
                              filename="data-%s.xml" % self.dataset_code,
                              headers=SDMX_DATA_HEADERS)

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                dimension_keys=self.xml_dsd.dimension_keys)
        
        #TODO: response and exception
        try:
            filepath, response = download.get_filepath_and_response()        
        except requests.exceptions.HTTPError as err:
            logger.critical("AUTRE ERREUR HTTP : %s" % err.response.status_code)
            raise
            
        self.rows = self.xml_data.process(filepath)


    def __next__(self):
        _series = next(self.rows)
        
        if not _series:
            raise StopIteration()
        
        return self.build_series(_series)

    def is_updated(self, bson):
        """Verify if series changes
        """
        if not self.last_update:
            return True
        
        series_updated = bson['last_update']
        _is_updated = series_updated > self.last_update

        if not _is_updated and logger.isEnabledFor(logging.INFO):
            logger.info("bypass updated dataset_code[%s][%s] - idbank[%s][%s]" % (self.dataset_code,
                                                                                 self.last_update, 
                                                                                 bson['key'],
                                                                                 series_updated))
        
        return _is_updated

    def build_series(self, bson):
        #TODO: last_update : update dataset ?
        #bson["last_update"] = self.last_update
        return bson
示例#7
0
文件: insee.py 项目: Widukind/dlstats
class INSEE_Data(SeriesIterator):

    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)

        self.store_path = self.get_store_path()

        if not "series_last_update" in self.dataset.metadata:
            self.dataset.metadata["series_last_update"] = {}

        #TODO: prendre cette info dans la DSD sans utiliser dataflows
        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.fetcher.xml_sdmx)
        self.xml_dsd.concepts = self.fetcher._concepts
        self.xml_dsd.codelists = self.fetcher._codelists

        self._load_dsd()

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dsd_id,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        self.rows = self._get_data_by_dimension()

    def _load_dsd_by_element(self):

        #FIXME: Manque codelist et concepts ?

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s" % self.dsd_id
        download = Downloader(url=url,
                              filename="datastructure-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _load_dsd(self):
        """
        #TODO: il y a une DSD pour chaque groupe de séries (soit environ 400),
        - download 1 dsd partage par plusieurs dataset
        - 668 datase
        """

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url,
                              filename="dsd-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)

        filepath, response = download.get_filepath_and_response()

        if response:
            if response.status_code == HTTP_ERROR_LONG_RESPONSE:
                self._load_dsd_by_element()
                return
            elif response.status_code >= 400:
                raise response.raise_for_status()

        if not os.path.exists(filepath):
            self._load_dsd_by_element()
            return

        self.fetcher.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _set_dataset(self):

        dataset = dataset_converter(self.xml_dsd, self.dataset_code, dsd_id=self.dsd_id)
        self.dataset.dimension_keys = dataset["dimension_keys"]
        self.dataset.attribute_keys = dataset["attribute_keys"]
        self.dataset.concepts = dataset["concepts"]
        self.dataset.codelists = dataset["codelists"]

    def _get_dimensions_from_dsd(self):
        return get_dimensions_from_dsd(self.xml_dsd, self.provider_name, self.dataset_code)

    def _get_data_by_dimension(self):

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        choice = "avg"
        if self.dataset_code in ["IPC-2015-COICOP"]:
            choice = "max"

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions,
                                                            choice=choice)

        count_dimensions = len(dimension_keys)

        logger.info("choice[%s] - filterkey[%s] - count[%s] - provider[%s] - dataset[%s]" % (choice, _key, len(dimension_values), self.provider_name, self.dataset_code))

        for dimension_value in dimension_values:
            '''Pour chaque valeur de la dimension, generer une key d'url'''

            key = get_key_for_dimension(count_dimensions, position, dimension_value)

            url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (self.dataset_code, key)
            if self._is_good_url(url) is False:
                logger.warning("bypass not good url[%s]" % url)
                continue

            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  use_existing_file=self.fetcher.use_existing_file,
                                  #NOT USE FOR INSEE client=self.fetcher.requests_client
                                  )
            filepath, response = download.get_filepath_and_response()

            if not response is None:
                self._add_url_cache(url, response.status_code)

            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            elif not filepath or not os.path.exists(filepath):
                continue

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                raise response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)

        yield None, None

    def _is_updated(self, bson):
        """Verify if series changes

        Return True si la series doit etre mise a jour et False si elle est a jour
        """
        if not bson["key"] in self.dataset.metadata["series_last_update"]:
            self.dataset.metadata["series_last_update"][bson["key"]] = bson.get('last_update')
            return True

        last_update = self.dataset.metadata["series_last_update"][bson["key"]]

        series_updated = bson.get('last_update')
        if not series_updated:
            return True

        if series_updated > last_update:
            return True

        return False

    def clean_field(self, bson):
        bson["attributes"].pop("IDBANK", None)
        bson = super().clean_field(bson)
        return bson

    def build_series(self, bson):
        self.dataset.add_frequency(bson["frequency"])

        if not self._is_updated(bson):
            raise errors.RejectUpdatedSeries(provider_name=self.provider_name,
                                             dataset_code=self.dataset_code,
                                             key=bson.get('key'))

        return bson