Example #1
File: insee.py Project: Widukind/dlstats
class INSEE(Fetcher):
    def __init__(self, **kwargs):
        super().__init__(provider_name='INSEE', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='National Institute of Statistics and Economic Studies',
            version=VERSION,
            region='France',
            website='http://www.insee.fr',
            terms_of_use='http://www.insee.fr/en/service/default.asp?page=rediffusion/rediffusion.htm',
            fetcher=self)

        self.xml_sdmx = XMLSDMX(agencyID=self.provider_name,
                                store_filepath=self.store_path,
                                use_existing_file=self.use_existing_file)

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.xml_sdmx)

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._categorisations_categories = None
        self._concepts = None
        self._codelists = OrderedDict()

        self.requests_client = requests.Session()

    def _load_structure_dataflows(self, force=False):

        if self._dataflows and not force:
            return

        self.provider_verify()

        url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name

        if self.refresh_meta is False:
            self._dataflows = self._structure_get("dataflows")

            if self._dataflows:
                self.xml_dsd.dataflows = self._dataflows
                logger.info(
                    "load structure [dataflows] from metadata for url[%s]" %
                    url)
                return

        download = Downloader(url=url,
                              filename="dataflow.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file,
                              client=self.requests_client)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        self._structure_put("dataflows", url, **self._dataflows)

    def _load_structure_datatree(self, force=False):

        if self._categoryschemes and self._categorisations and not force:
            return

        self._load_structure_dataflows(force)

        url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
        """
        if self.refresh_meta is False:
            self._categoryschemes = self._structure_get("categoryschemes")
            if self._categoryschemes:
                logger.info("load structure [categoryschemes] from metadata for url[%s]" % url)
        """
        if not self._categoryschemes:
            download = Downloader(url=url,
                                  filename="categoryscheme.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._categoryschemes = self.xml_dsd.categories
            #self._structure_put("categoryschemes", url, **self._categoryschemes)

        url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
        """
        if self.refresh_meta is False:
            self._categorisations = self._structure_get("categorisation")
            if self._categorisations:
                self._categorisations_categories = self._structure_get("categorisations_categories")
                logger.info("load structure [categorisation] from metadata for url[%s]" % url)
        """

        if not self._categorisations:
            download = Downloader(url=url,
                                  filename="categorisation.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._categorisations = self.xml_dsd.categorisations
            self._categorisations_categories = self.xml_dsd.categorisations_categories
            #self._structure_put("categorisation", url, **self._categorisations)
            #self._structure_put("categorisations_categories", url, **self._categorisations_categories)

    def _load_structure_concepts(self, force=False):

        if self._dataflows and self._concepts and not force:
            return

        self._load_structure_dataflows(force)

        url = "http://www.bdm.insee.fr/series/sdmx/conceptscheme/%s" % self.provider_name
        if self.refresh_meta is False:
            self._concepts = self._structure_get("concepts")
            if self._concepts:
                self.xml_dsd.concepts = self._concepts
                logger.info(
                    "load structure [concepts] from metadata for url[%s]" %
                    url)

        if not self._concepts:
            download = Downloader(url=url,
                                  filename="conceptscheme.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._concepts = self.xml_dsd.concepts
            self._structure_put("concepts", url, **self._concepts)

    def load_datasets_first(self):
        self._load_structure_datatree()
        return super().load_datasets_first()

    def build_data_tree(self):
        """Build data_tree from structure datas
        """
        self._load_structure_datatree()

        categories = []

        position = 0
        for category_code, category in self._categoryschemes.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1

            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self._categorisations_categories:
                categorisation_ids = self._categorisations_categories[
                    category_code]

                for categorisation_id in categorisation_ids:
                    categorisation = self._categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    #dataset = self.xml_dsd.dataflows[dataflow_id]
                    if dataflow_id not in self._dataflows:
                        logger.critical("dataflow not found [%s]" %
                                        dataflow_id)
                        continue
                    dataset = self._dataflows[dataflow_id]

                    cat["datasets"].append({
                        "dataset_code": dataset['id'],
                        "name": dataset["name"],
                        "last_update": None,
                        "metadata": {
                            "dsd_id": dataset["dsd_id"]
                        }
                    })

            categories.append(cat)

        return categories

    def upsert_dataset(self, dataset_code):

        self._load_structure_dataflows()
        self._load_structure_concepts()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=None,
                           fetcher=self)
        dataset.last_update = clean_datetime()

        insee_data = INSEE_Data(dataset)
        dataset.series.data_iterator = insee_data

        return dataset.update_database()

    def get_calendar(self):

        datasets = {d['name']: d['dataset_code'] for d in self.datasets_list()}

        DATEEXP = re.compile(
            r"(January|February|March|April|May|June|July|August|September|October|November|December)[ ]+\d+[ ]*,[ ]+\d+[ ]+\d+:\d+"
        )
        url = 'http://www.insee.fr/en/service/agendas/agenda.asp'

        d = pq(url=url, parser='html')

        for li in d('div#contenu')('ul.liens')("li.princ-ind"):
            try:

                # April 21, 2016  08:45 - INSEE
                text = pq(li)("p.info")[0].text

                _date = datetime.strptime(
                    DATEEXP.match(text).group(), '%B %d, %Y %H:%M')

                #/en/themes/indicateur.asp?id=105
                url1 = "http://www.insee.fr%s" % pq(li)("a")[0].get("href")
                page2 = pq(url=url1, parser='html')

                # 'http://www.bdm.insee.fr/bdm2/choixCriteres.action?request_locale=en&codeGroupe=1007'
                url2 = page2("div#savoirplus")('p')('a')[0].get("href")
                page3 = pq(url=url2, parser='html')

                #telechargeSDMX-ML?lien=CLIMAT-AFFAIRES&groupeLibc=CLIMAT-AFFAIRES
                dataset_code = page3("a#exportSDMX")[0].get("href").split(
                    "=")[-1]

                #print("dataset_code : ", dataset_code)

                if dataset_code in datasets:

                    yield {
                        'action': "update-dataset",
                        "kwargs": {
                            "provider_name": self.provider_name,
                            "dataset_code": dataset_code
                        },
                        "period_type": "date",
                        "period_kwargs": {
                            "run_date":
                            datetime(_date.year, _date.month, _date.day,
                                     _date.hour, _date.minute + 2, 0),
                            "timezone":
                            'Europe/Paris'
                        }
                    }

            except Exception as err:
                logger.exception(err)
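A minimal usage sketch for the fetcher above. The import path and the dataset code are assumptions for illustration; in dlstats the fetchers are normally driven by the project's CLI rather than called directly.

# Hypothetical driver; assumes the class above is importable as
# dlstats.fetchers.insee.INSEE and that the backing database is configured.
from dlstats.fetchers.insee import INSEE

fetcher = INSEE()

# Build the category tree from the downloaded SDMX structure files.
for category in fetcher.build_data_tree():
    print(category["category_code"], len(category["datasets"]), "dataset(s)")

# 'IPC-2015' is an illustrative dataset code, not a verified one.
fetcher.upsert_dataset('IPC-2015')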
Example #2
File: insee.py Project: ThomasRoca/dlstats
class INSEE(Fetcher):
    
    def __init__(self, **kwargs):
        super().__init__(provider_name='INSEE', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                 long_name='National Institute of Statistics and Economic Studies',
                                 version=VERSION,
                                 region='France',
                                 website='http://www.insee.fr',
                                 fetcher=self)
        
        self.xml_sdmx = None
        self.xml_dsd = None
        
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._concepts = None
        self._codelists = OrderedDict()
        
        self.requests_client = requests.Session()
                
    def _add_metadata(self):
        return
        #TODO:
        self.provider.metadata = {
            "web": {
                "remote_series": "http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=%(key)s",
                "remote_datasets": "http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=%(dataset_code)s",
                "remote_category": None,
            }
        }
    
    def _load_structure(self, force=False):
        
        if self._dataflows and not force:
            return

        self.xml_sdmx = XMLSDMX(agencyID=self.provider_name)
        
        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.xml_sdmx)       
        
        url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name
        download = Downloader(url=url, 
                              filename="dataflow.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
        download = Downloader(url=url, 
                              filename="categoryscheme.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories

        url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
        download = Downloader(url=url, 
                              filename="categorisation.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations
        
        url = "http://www.bdm.insee.fr/series/sdmx/conceptscheme/%s" % self.provider_name
        download = Downloader(url=url, 
                              filename="conceptscheme.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._concepts = self.xml_dsd.concepts
        
    def load_datasets_first(self):
        self._load_structure()
        return super().load_datasets_first()

    def build_data_tree(self):
        """Build data_tree from structure datas
        """
        self._load_structure()
        
        categories = []
        
        position = 0
        for category_code, category in self.xml_dsd.categories.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1
                
            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,                
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self.xml_dsd.categorisations_categories:
                categorisation_ids = self.xml_dsd.categorisations_categories[category_code]
                
                for categorisation_id in categorisation_ids:
                    categorisation = self.xml_dsd.categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    dataset = self.xml_dsd.dataflows[dataflow_id]
                    
                    cat["datasets"].append({
                        "dataset_code": dataset['id'], 
                        "name":dataset["name"],
                        "last_update": None,
                        "metadata": {
                            "dsd_id": dataset["dsd_id"]
                        }
                    })
                
            categories.append(cat)
            
        return categories

    def upsert_dataset(self, dataset_code):

        self._load_structure()

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=None,
                           last_update=clean_datetime(),
                           fetcher=self)
        
        query = {'provider_name': self.provider_name, 
                 "dataset_code": dataset_code}        
        dataset_doc = self.db[constants.COL_DATASETS].find_one(query)
        
        insee_data = INSEE_Data(dataset,
                                dataset_doc=dataset_doc)
        dataset.series.data_iterator = insee_data
        
        return dataset.update_database()

    def get_calendar(self):
        """Parse agenda of new releases and schedule jobs"""
        
        name_list = {d['name']: d['dataset_code'] for d in self.datasets_list()}
        DATEEXP = re.compile(r"(January|February|March|April|May|June|July|August|September|October|November|December)[ ]+\d+[ ]*,[ ]+\d+[ ]+\d+:\d+")
        url = 'http://www.insee.fr/en/service/agendas/agenda.asp'
        page = download_page(url)
        agenda = etree.HTML(page)
        ul = agenda.find('.//div[@id="contenu"]').find('.//ul[@class="liens"]')
        for li in ul.iterfind('li'):
            text = li.find('p[@class="info"]').text
            _date = datetime.strptime(DATEEXP.match(text).group(), '%B %d, %Y %H:%M')
            href = li.find('.//a').get('href')
            groups = self._parse_theme(urljoin('http://www.insee.fr', href))
            for group in groups:
                group_info = self._parse_group_page(group['url'])
                yield {'action': "update_node",
                       "kwargs": {"provider_name": self.provider_name,
                                  "dataset_code": name_list[group_info['name']]},
                       "period_type": "date",
                       "period_kwargs": {"run_date": datetime(_date.year,
                                                              _date.month,
                                                              _date.day,
                                                              _date.hour,
                                                              _date.minute+5,
                                                              0),
                                         "timezone": pytz.country_timezones('fr')}
                     }

    def _parse_theme(self, url):
        """Find updated code group and url"""

        page = download_page(url)
        theme = etree.HTML(page)
        p = theme.find('.//div[@id="savoirplus"]').find('p')
        groups = []
        for a in p.iterfind('.//a'):
            groups.append({'code': a.text[1:],
                           'url': a.get('href')})
        return groups

    def _parse_group_page(self, url):
        """Find updated dataset code"""

        page = download_page(url)
        group = etree.HTML(page)
        div = group.find('.//div[@id="contenu"]')
        name = div.find('.//h1').text
        # this will be useful if we change the way to download INSEE data
        url = div.find('.//a[@id="exportSDMX"]').get('href')
        return {'name': name, 'url': url}
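The calendar entries yielded by get_calendar() are plain dicts; a sketch of how a scheduler loop might consume them (fetcher construction assumed as in the previous sketch; note the loop fetches the live INSEE agenda page):

fetcher = INSEE()

for event in fetcher.get_calendar():
    kwargs = event["kwargs"]           # provider_name and dataset_code
    run_date = event["period_kwargs"]["run_date"]
    # A real deployment would register a job with a scheduler here
    # (e.g. APScheduler); printing stands in for that.
    print("would run %s for %s at %s"
          % (event["action"], kwargs["dataset_code"], run_date))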
Example #3
File: ecb.py Project: ThomasRoca/dlstats
class ECB(Fetcher):
    
    def __init__(self, **kwargs):        
        super().__init__(provider_name='ECB', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='European Central Bank',
                                  version=VERSION,
                                  region='Europe',
                                  website='http://www.ecb.europa.eu',
                                  fetcher=self)
    
        self.xml_sdmx = None
        self.xml_dsd = None
        
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._concepts = None
        
        self.requests_client = requests.Session()

    def _load_structure(self, force=False):
        """Load structure and build data_tree
        """
        
        if self._dataflows and not force:
            return
        
        self.xml_dsd = XMLStructure(provider_name=self.provider_name)       
        
        url = "http://sdw-wsrest.ecb.int/service/dataflow/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url, 
                                    filename="dataflow.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        url = "http://sdw-wsrest.ecb.int/service/categoryscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url, 
                                    filename="categoryscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories

        url = "http://sdw-wsrest.ecb.int/service/categorisation/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url, 
                                    filename="categorisation.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations
        
        url = "http://sdw-wsrest.ecb.int/service/conceptscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url, 
                                    filename="conceptscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        
        self.xml_dsd.process(filepath)
        self._concepts = self.xml_dsd.concepts
        
    def build_data_tree(self):

        self._load_structure()
        
        categories = []
        
        position = 0
        for category_code, category in self.xml_dsd.categories.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1
                
            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents, 
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self.xml_dsd.categorisations_categories:
                categorisation_ids = self.xml_dsd.categorisations_categories[category_code]
                
                for categorisation_id in categorisation_ids:
                    categorisation = self.xml_dsd.categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    if dataflow_id not in self.xml_dsd.dataflows:
                        logger.warning("dataflow[%s] is not in xml_dsd.dataflows" % dataflow_id)
                        continue
                        
                    dataset = self.xml_dsd.dataflows[dataflow_id]
                    
                    cat["datasets"].append({
                        "dataset_code": dataset['id'], 
                        "name":dataset["name"],
                        "last_update": None,
                        "metadata": {
                            "dsd_id": dataset["dsd_id"]
                        }
                    })
                
            categories.append(cat)
            
        return categories
        
    def _parse_agenda(self):
        download = utils.Downloader(store_filepath=self.store_path,
                              url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                              filename="statscall.html")
        filepath = download.get_filepath()
        with open(filepath, 'rb') as fp:
            agenda = lxml.html.parse(fp)
        self.for_delete.append(filepath)
        
        regex_date = re.compile(r"Reference period: (.*)")
        regex_dataset = re.compile(r".*Dataset: (.*)\)")
        entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                               '//div[@class="ecb-faytdd"]/*/dd')[2:]
        entries = zip(entries[::2], entries[1::2])
        for entry in entries:
            item = {}
            match_key = regex_dataset.match(entry[1][0].text_content())
            item['dataflow_key'] = match_key.groups()[0]
            match_date = regex_date.match(entry[1][1].text_content())
            item['reference_period'] = match_date.groups()[0]
            item['scheduled_date'] = entry[0].text_content().replace('\n', '')
            yield item

    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self._parse_agenda():

            if entry['dataflow_key'] in datasets:

                yield {'action': 'update_node',
                       'kwargs': {'provider_name': self.provider_name,
                                  'dataset_code': entry['dataflow_key']},
                       'period_type': 'date',
                       'period_kwargs': {'run_date': datetime.strptime(
                           entry['scheduled_date'], "%d/%m/%Y %H:%M CET"),
                           'timezone': pytz.timezone('CET')
                       }
                      }

    def upsert_dataset(self, dataset_code):
        
        self._load_structure()
        
        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           last_update=utils.clean_datetime(),
                           fetcher=self)

        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        return dataset.update_database()

    def load_datasets_first(self):
        self._load_structure()
        return super().load_datasets_first()
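The two regular expressions in _parse_agenda() can be exercised in isolation. The sample strings below are shaped like the text content the XPath extraction yields, but they are fabricated for illustration:

import re

regex_date = re.compile(r"Reference period: (.*)")
regex_dataset = re.compile(r".*Dataset: (.*)\)")

# Illustrative inputs only; not copied from the live ECB page.
dd_dataset = "(Monetary developments in the euro area; Dataset: BSI)"
dd_period = "Reference period: February 2016"

print(regex_dataset.match(dd_dataset).groups()[0])  # BSI
print(regex_date.match(dd_period).groups()[0])      # February 2016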
Example #4
class ECB(Fetcher):
    def __init__(self, **kwargs):
        super().__init__(provider_name='ECB', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='European Central Bank',
            version=VERSION,
            region='Europe',
            website='http://www.ecb.europa.eu',
            terms_of_use='https://www.ecb.europa.eu/home/disclaimer/html/index.en.html',
            fetcher=self)

        self.xml_sdmx = None
        self.xml_dsd = None

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._concepts = None

        #self.requests_client = requests.Session()

    def _load_structure(self, force=False):
        """Load structure and build data_tree
        """

        if self._dataflows and not force:
            return

        self.xml_dsd = XMLStructure(provider_name=self.provider_name)

        url = "http://sdw-wsrest.ecb.int/service/dataflow/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="dataflow.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        url = "http://sdw-wsrest.ecb.int/service/categoryscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="categoryscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories

        url = "http://sdw-wsrest.ecb.int/service/categorisation/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="categorisation.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations

        url = "http://sdw-wsrest.ecb.int/service/conceptscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="conceptscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._concepts = self.xml_dsd.concepts

    def build_data_tree(self):

        self._load_structure()

        categories = []

        position = 0
        for category_code, category in self.xml_dsd.categories.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1

            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self.xml_dsd.categorisations_categories:
                categorisation_ids = self.xml_dsd.categorisations_categories[
                    category_code]

                for categorisation_id in categorisation_ids:
                    categorisation = self.xml_dsd.categorisations[
                        categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    if dataflow_id not in self.xml_dsd.dataflows:
                        logger.warning(
                            "dataflow[%s] is not in xml_dsd.dataflows" %
                            dataflow_id)
                        continue

                    dataset = self.xml_dsd.dataflows[dataflow_id]

                    cat["datasets"].append({
                        "dataset_code": dataset['id'],
                        "name": dataset["name"],
                        "last_update": None,
                        "metadata": {
                            "dsd_id": dataset["dsd_id"]
                        }
                    })

            if len(cat["datasets"]) > 0:
                categories.append(cat)

        return categories

    def _parse_agenda(self):
        download = utils.Downloader(
            store_filepath=self.store_path,
            url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
            filename="statscall.html")
        filepath = download.get_filepath()
        with open(filepath, 'rb') as fp:
            agenda = lxml.html.parse(fp)
        self.for_delete.append(filepath)

        regex_date = re.compile(r"Reference period: (.*)")
        regex_dataset = re.compile(r".*Dataset: (.*)\)")
        entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                               '//div[@class="ecb-faytdd"]/*/dd')[2:]
        entries = zip(entries[::2], entries[1::2])
        for entry in entries:
            item = {}
            match_key = regex_dataset.match(entry[1][0].text_content())
            item['dataflow_key'] = match_key.groups()[0]
            match_date = regex_date.match(entry[1][1].text_content())
            item['reference_period'] = match_date.groups()[0]
            item['scheduled_date'] = entry[0].text_content().replace('\n', '')
            yield item

    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self._parse_agenda():

            if entry['dataflow_key'] in datasets:

                scheduled_date = entry.pop("scheduled_date")
                run_date = datetime.strptime(scheduled_date,
                                             "%d/%m/%Y %H:%M CET")

                yield {
                    'action': 'update-dataset',
                    'kwargs': {
                        'provider_name': self.provider_name,
                        'dataset_code': entry['dataflow_key']
                    },
                    'period_type': 'date',
                    'period_kwargs': {
                        'run_date': run_date,
                        'timezone': 'CET'
                    }
                }

    def upsert_dataset(self, dataset_code):

        self._load_structure()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           fetcher=self)
        dataset.last_update = utils.clean_datetime()

        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        return dataset.update_database()

    def load_datasets_first(self):
        self._load_structure()
        return super().load_datasets_first()
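The run_date parsing in get_calendar() works because strptime treats the trailing 'CET' as literal text that must appear in the input; a standalone check with an illustrative date value:

from datetime import datetime

scheduled_date = "14/04/2016 10:00 CET"  # illustrative value
run_date = datetime.strptime(scheduled_date, "%d/%m/%Y %H:%M CET")
print(run_date)  # 2016-04-14 10:00:00 (naive datetime; 'CET' is not parsed)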