def test_version_field(self):
    # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_version_field
    """A Providers document must declare a ``version``: building one without
    it fails schema validation, building one with it stores a single record."""
    self._collections_is_empty()

    fetcher = Fetcher(provider_name="p1", db=self.db)

    # No version supplied -> the schema rejects the document.
    with self.assertRaises(MultipleInvalid):
        Providers(name="p1",
                  long_name="Provider One",
                  region="Dreamland",
                  website="http://www.example.com",
                  fetcher=fetcher)

    provider = Providers(name="p1",
                         long_name="Provider One",
                         version=1,
                         region="Dreamland",
                         website="http://www.example.com",
                         fetcher=fetcher)
    provider.update_database()

    self.assertEqual(self.db[constants.COL_PROVIDERS].count(), 1)
def test_unique_constraint(self):
    # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_unique_constraint
    """Provider names are unique: inserting a duplicate name raises
    DuplicateKeyError, while a distinct name stores a second document."""
    self._collections_is_empty()

    fetcher = Fetcher(provider_name="p1", db=self.db)

    first = Providers(name="p1",
                      long_name="Provider One",
                      version=1,
                      region="Dreamland",
                      website="http://www.example.com",
                      fetcher=fetcher)
    first.update_database()
    self.assertEqual(self.db[constants.COL_PROVIDERS].count(), 1)

    # Raw insert with the same name must hit the unique index.
    existing_provider = dict(name="p1")
    with self.assertRaises(DuplicateKeyError):
        self.db[constants.COL_PROVIDERS].insert(existing_provider)

    # A different name is accepted.
    second = Providers(name="p2",
                       long_name="Provider One",
                       version=1,
                       region="Dreamland",
                       website="http://www.example.com",
                       fetcher=fetcher)
    second.update_database()
    self.assertEqual(self.db[constants.COL_PROVIDERS].count(), 2)
def test_update_database(self):
    # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_update_database
    """update_database() returns the ObjectId of the stored provider and the
    document is subsequently retrievable by name."""
    self._collections_is_empty()

    fetcher = Fetcher(provider_name="p1", db=self.db)
    provider = Providers(name="p1",
                         long_name="Provider One",
                         version=1,
                         region="Dreamland",
                         website="http://www.example.com",
                         fetcher=fetcher)

    # Renamed from `id` to avoid shadowing the builtin.
    result_id = provider.update_database()
    self.assertIsNotNone(result_id)
    self.assertIsInstance(result_id, ObjectId)
    self.db[constants.COL_PROVIDERS].find_one({'_id': ObjectId(result_id)})

    bson = self.db[constants.COL_PROVIDERS].find_one({"name": "p1"})
    self.assertIsNotNone(bson)
    self.assertEqual(bson["name"], "p1")
    self.assertEqual(bson["website"], "http://www.example.com")
def test_add_data_tree(self):
    # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_add_data_tree
    """A new provider starts with a one-node data_tree; add_category()
    appends a child node keyed ``<provider>.<category_code>``."""
    fetcher = Fetcher(provider_name="p1", is_indexes=False)
    provider = Providers(name="p1",
                         long_name="Provider One",
                         version=1,
                         region="Dreamland",
                         website="http://www.example.com",
                         fetcher=fetcher)

    self.assertEqual(len(provider.data_tree), 1)

    # Fill in the root node from the provider's own attributes.
    root = provider.data_tree[0]
    root["category_code"] = provider.name
    root["long_name"] = provider.long_name
    root["website"] = provider.website
    provider.update_database()

    category = {'category_code': "c0", 'name': "p1"}
    provider.add_category(category)

    expected_tree = [
        {'category_code': 'p1',
         'datasets': [],
         'description': None,
         'doc_href': 'http://www.example.com',
         'exposed': False,
         'last_update': None,
         'name': 'p1'},
        {'category_code': 'p1.c0',
         'datasets': [],
         'description': None,
         'doc_href': None,
         'exposed': False,
         'last_update': None,
         'name': 'p1'},
    ]
    self.assertEqual(provider.data_tree, expected_tree)
class FED(Fetcher):
    """Fetcher for US Federal Reserve datasets declared in the static DATASETS map."""

    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='FED', db=db, **kwargs)
        # Provider metadata document for the providers collection.
        self.provider = Providers(name=self.provider_name,
                                  long_name='Federal Reserve',
                                  version=VERSION,
                                  region='US',
                                  website='http://www.federalreserve.gov',
                                  fetcher=self)

    def build_data_tree(self, force_update=False):
        """Build the category/dataset tree from DATASETS.

        Returns the existing tree untouched when it already has more than
        one node and force_update is not set; otherwise creates one category
        per dataset code and attaches the dataset to it.
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category({"name": dataset["name"],
                                                       "category_code": category_code,
                                                       "doc_href": dataset["doc_href"]})
            _dataset = {"name": dataset["name"],
                        "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)

        return self.provider.data_tree

    def upsert_dataset(self, dataset_code):
        """Download and store one dataset; returns update_database()'s result."""
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        #TODO: check whether the dataset already exists or only needs an update !!!

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           last_update=datetime.now(),
                           fetcher=self)

        _data = FED_Data(dataset=dataset,
                         url=DATASETS[dataset_code]['url'])
        dataset.series.data_iterator = _data
        result = dataset.update_database()
        # Drop the iterator reference once the load is done.
        _data = None

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

        return result

    def load_datasets_first(self):
        """Initial load: persist provider and data tree, then every dataset."""
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))

        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                # Best-effort: one failing dataset does not abort the whole load.
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: incremental update not implemented — full reload for now.
        self.load_datasets_first()
class Esri(Fetcher):
    """Fetcher for Japan's ESRI (Cabinet Office) statistics, driven by
    scraping the ESRI web site to build the data tree."""

    def __init__(self, db=None):
        super().__init__(provider_name='ESRI', db=db)
        self.provider = Providers(name=self.provider_name,
                                  long_name='Economic and Social Research Institute, Cabinet Office',
                                  version=VERSION,
                                  region='Japan',
                                  website='http://www.esri.cao.go.jp/index-e.html',
                                  fetcher=self)
        # Cache of dataset settings (populated elsewhere).
        self.datasets_dict = {}
        # Category-code fragments selecting which datasets to harvest.
        self.selected_codes = ['GDP.Amount']

    def build_data_tree(self, force_update=False):
        """Build data_tree from ESRI site parsing
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        def make_node(data, parent_key):
            # Recursively mirror the parsed site tree into provider categories/datasets.
            _category = dict(name=data['name'],
                             category_code=data['category_code'])
            _category_key = self.provider.add_category(_category, parent_code=parent_key)
            if 'children' in data:
                for c in data['children']:
                    make_node(c, _category_key)
            if 'datasets' in data:
                for d in data['datasets']:
                    self.provider.add_dataset(dict(dataset_code=d['dataset_code'],
                                                   name=d['name'],
                                                   last_update=d['release_date'],
                                                   metadata={'url': d['url'],
                                                             'doc_href': d['doc_href']}),
                                              _category_key)

        try:
            for data in parse_esri_site():
                make_node(data, self.provider_name)
        except Exception as err:
            logger.error(err)
            raise

    def get_selected_datasets(self):
        """Collects the dataset codes that are in data_tree
        below the ones indicated in "selected_codes"
        provided in configuration

        :returns: list of dict of dataset settings"""
        # Build one regex alternation matching any selected code fragment.
        category_filter = [".*%s.*" % d for d in self.selected_codes]
        category_filter = "|".join(category_filter)
        self.selected_datasets = {d['dataset_code']: d
                                  for d in self.datasets_list(category_filter=category_filter)}
        return self.selected_datasets

    # necessary for test mock
    def make_url(self):
        # Reads self.dataset_settings set by upsert_dataset().
        return self.dataset_settings['metadata']['url']

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets
        :dset: dataset_code
        :returns: None"""
        self.get_selected_datasets()

        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        self.dataset_settings = self.selected_datasets[dataset_code]
        url = self.make_url()
        dataset = Datasets(self.provider_name, dataset_code, fetcher=self)
        dataset.name = self.dataset_settings['name']
        dataset.doc_href = self.dataset_settings['metadata']['doc_href']
        dataset.last_update = self.dataset_settings['last_update']
        data_iterator = EsriData(dataset, url, filename=dataset_code)
        dataset.series.data_iterator = data_iterator
        dataset.update_database()

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    # TO BE FINISHED
    def parse_sna_agenda(self):
        #TODO: use Downloader
        download = Downloader(url="http://www.esri.cao.go.jp/en/sna/kouhyou/kouhyou_top.html",
                              filename="agenda_sna.html")
        with open(download.get_filepath(), 'rb') as fp:
            # NOTE(review): parsed tree is discarded — method is unfinished.
            agenda = lxml.html.parse(fp)

    # TO BE FINISHED
    def get_calendar(self):
        """Yield scheduler actions for datasets present in the agenda."""
        datasets = [d["dataset_code"] for d in self.datasets_list()]
        for entry in self.parse_agenda():
            if entry['dataflow_key'] in datasets:
                yield {'action': 'update_node',
                       'kwargs': {'provider_name': self.provider_name,
                                  'dataset_code': entry['dataflow_key']},
                       'period_type': 'date',
                       'period_kwargs': {'run_date': datetime.strptime(
                                             entry['scheduled_date'],
                                             "%d/%m/%Y %H:%M CET"),
                                         'timezone': pytz.timezone('Asia/Tokyo')
                                         }
                       }

    # TODO: load earlier versions to get revisions
    def load_datasets_first(self):
        """Initial load: provider, data tree, then all selected datasets."""
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))

        self.provider.update_database()
        self.build_data_tree()
        self.upsert_data_tree()

        datasets_list = [d for d in self.get_selected_datasets().keys()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                # Best-effort: keep loading remaining datasets.
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        """Update pass over every dataset in the data tree.

        NOTE(review): unlike load_datasets_first this iterates datasets_list()
        rather than the selected subset — confirm this is intentional.
        """
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))

        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
class ECB(Fetcher):
    """Fetcher for the European Central Bank SDMX web service."""

    def __init__(self, db=None, sdmx=None, **kwargs):
        super().__init__(provider_name='ECB', db=db, **kwargs)

        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='European Central Bank',
                                      version=VERSION,
                                      region='Europe',
                                      website='http://www.ecb.europa.eu',
                                      fetcher=self)
            self.provider.update_database()

        # Re-store the provider document when the fetcher version changed.
        if self.provider.version != VERSION:
            self.provider.update_database()

        self.sdmx = sdmx or ECBRequest(agency=self.provider_name)
        self.sdmx.timeout = 90

        # SDMX structure caches, lazily filled by _load_structure().
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None

    def _load_structure(self, force=False):
        """Load structure and build data_tree
        """
        if (self._dataflows and self._categoryschemes and self._categorisations) and not force:
            return

        '''Force URL for select only ECB agency'''
        categoryschemes_response = self.sdmx.get(
            resource_type='categoryscheme',
            url='http://sdw-wsrest.ecb.int/service/categoryscheme/%s?references=parentsandsiblings' % self.provider_name)
        self._categorisations = categoryschemes_response.msg.categorisations
        self._categoryschemes = categoryschemes_response.msg.categoryschemes
        self._dataflows = categoryschemes_response.msg.dataflows

    def build_data_tree(self, force_update=False):
        """Build data_tree from structure datas
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        self._load_structure()

        for category in self._categoryschemes.aslist():
            _category = dict(name=category.name.en,
                             category_code=category.id)
            category_key = self.provider.add_category(_category)

            for subcategory in category.values():
                # Skip subcategories with no categorisation entry.
                if not subcategory.id in self._categorisations:
                    continue
                _subcategory = dict(name=subcategory.name.en,
                                    category_code=subcategory.id)
                _subcategory_key = self.provider.add_category(_subcategory,
                                                              parent_code=category_key)
                try:
                    _categorisation = self._categorisations[subcategory.id]
                    for i in _categorisation:
                        _d = self._dataflows[i.artefact.id]
                        self.provider.add_dataset(dict(dataset_code=_d.id,
                                                       name=_d.name.en),
                                                  _subcategory_key)
                except Exception as err:
                    logger.error(err)
                    raise

        return self.provider.data_tree

    def parse_agenda(self):
        """Yield {dataflow_key, reference_period, scheduled_date} dicts parsed
        from the ECB statistics release calendar page."""
        #TODO: use Downloader
        download = Downloader(url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                              filename="statscall.html")
        with open(download.get_filepath(), 'rb') as fp:
            agenda = lxml.html.parse(fp)

        # Fixed: raw strings — "\)" was an invalid escape in a normal string.
        regex_date = re.compile(r"Reference period: (.*)")
        regex_dataset = re.compile(r".*Dataset: (.*)\)")
        entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                               '//div[@class="ecb-faytdd"]/*/dd')[2:]
        # Pair each <dt> (date) with its following <dd> (description).
        entries = zip(entries[::2], entries[1::2])
        for entry in entries:
            item = {}
            match_key = regex_dataset.match(entry[1][0].text_content())
            item['dataflow_key'] = match_key.groups()[0]
            match_date = regex_date.match(entry[1][1].text_content())
            item['reference_period'] = match_date.groups()[0]
            item['scheduled_date'] = entry[0].text_content().replace('\n', '')
            yield(item)

    def get_calendar(self):
        """Yield scheduler actions for datasets present in the agenda."""
        datasets = [d["dataset_code"] for d in self.datasets_list()]
        for entry in self.parse_agenda():
            if entry['dataflow_key'] in datasets:
                yield {'action': 'update_node',
                       'kwargs': {'provider_name': self.provider_name,
                                  'dataset_code': entry['dataflow_key']},
                       'period_type': 'date',
                       'period_kwargs': {'run_date': datetime.strptime(
                                             entry['scheduled_date'],
                                             "%d/%m/%Y %H:%M CET"),
                                         'timezone': pytz.timezone('CET')
                                         }
                       }

    def upsert_dataset(self, dataset_code):
        """Download and store one dataset; returns update_database()'s result."""
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        #TODO: check whether the dataset already exists or only needs an update !!!

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           last_update=datetime.now(),
                           fetcher=self)

        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        # Fixed: removed a no-op `try: ... except: raise` wrapper (bare
        # re-raise adds nothing and masks intent).
        result = dataset.update_database()
        _data = None

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

        return result

    def load_datasets_first(self):
        """Initial load: structure, provider, data tree, then every dataset."""
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))

        self._load_structure()
        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                # Best-effort: keep loading remaining datasets.
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: incremental update not implemented — full reload for now.
        self.load_datasets_first()
class BIS(Fetcher):
    """Fetcher for Bank for International Settlements datasets declared in DATASETS."""

    def __init__(self, db=None):
        super().__init__(provider_name="BIS", db=db)

        if not self.provider:
            self.provider = Providers(
                name=self.provider_name,
                long_name="Bank for International Settlements",
                version=VERSION,
                region="world",
                website="http://www.bis.org",
                fetcher=self,
            )
            self.provider.update_database()

        # Re-store the provider document when the fetcher version changed.
        if self.provider.version != VERSION:
            self.provider.update_database()

    def upsert_dataset(self, dataset_code):
        """Download and store one dataset, skipping it when the remote
        release date shows nothing new."""
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        if not DATASETS.get(dataset_code):
            # NOTE(review): message lacks a separating space before the code.
            raise Exception("This dataset is unknown" + dataset_code)

        dataset = Datasets(
            provider_name=self.provider_name,
            dataset_code=dataset_code,
            name=DATASETS[dataset_code]["name"],
            doc_href=DATASETS[dataset_code]["doc_href"],
            fetcher=self,
        )

        fetcher_data = BIS_Data(dataset,
                                url=DATASETS[dataset_code]["url"],
                                filename=DATASETS[dataset_code]["filename"])

        if fetcher_data.is_updated():
            dataset.series.data_iterator = fetcher_data
            dataset.update_database()
            # TODO: clean datas (file temp)
            end = time.time() - start
            logger.info("upsert dataset[%s] - END-BEFORE-METAS - time[%.3f seconds]" % (dataset_code, end))
            self.update_metas(dataset_code)
            end = time.time() - start
            logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        else:
            logger.info(
                "upsert dataset[%s] bypass because is updated from release_date[%s]"
                % (dataset_code, fetcher_data.release_date)
            )

    def load_datasets_first(self):
        """Initial load: upsert every dataset declared in DATASETS."""
        start = time.time()
        logger.info("first load fetcher[%s] - START" % (self.provider_name))
        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code)
        end = time.time() - start
        logger.info("first load fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        # Full reload; upsert_dataset() itself skips unchanged datasets.
        self.load_datasets_first()

    def build_data_tree(self, force_update=False):
        """Build the category/dataset tree from DATASETS (one category per dataset)."""
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree
        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category(
                {"name": dataset["name"], "category_code": category_code, "doc_href": dataset["doc_href"]}
            )
            _dataset = {"name": dataset["name"], "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)
        return self.provider.data_tree

    def parse_agenda(self):
        """Parse the BIS release-calendar HTML into a list of rows.

        Row 0 holds the month headers (as datetimes, offset by two title
        columns); each following row is [title1, title2, day-or-None...].
        """
        agenda = etree.HTML(get_agenda())
        table = agenda.find(".//table")  # only one table
        rows = table[0].findall("tr")
        # skipping first row
        cells = rows[1].findall("td")
        agenda = []
        months = [None, None]
        for c in rows[1].iterfind("td"):
            content = c.find("strong")
            if content.text is None:
                content = content.find("strong")
            # NOTE: assumes this file imports the datetime *module* (datetime.datetime).
            months.append(datetime.datetime.strptime(content.text, "%B %Y"))
        agenda.append(months)
        ir = 2

        def get_links_text(cell):
            # Texts of all non-empty <a> links in a cell.
            txt = []
            for link in cell.findall("a"):
                if link.text:
                    txt.append(link.text)
            return txt

        def _get_dates(cells):
            # chr(160) is a non-breaking space used for empty day cells.
            item = []
            for ic, c in enumerate(cells):
                if c.text[0] != chr(160):
                    item.append(re.match("\d\d|\d", c.text).group(0))
                else:
                    item.append(None)
            return item

        while ir < len(rows):
            cells = rows[ir].findall("td")
            content = cells[0]
            if content.text is None:
                content = content.find("a")
            item = [content.text]
            if cells[0].get("rowspan") == "2":
                # Two-row entries carry a sub-title in the second cell.
                two_rows = True
                content = cells[1].find("a")
                item.append(content.text)
                offset = 2
            else:
                two_rows = False
                item.append(None)
                offset = 1
            item.extend(_get_dates(cells[offset:]))
            agenda.append(item)
            ir += 1
            if two_rows:
                # Continuation row: same main title, per-link sub-titles.
                cells = rows[ir].findall("td")
                links = get_links_text(cells[0])
                for content in links:
                    item = [item[0]]
                    item.append(content)
                    item.extend(_get_dates(cells[1:]))
                    agenda.append(item)
                ir += 1
        return agenda

    def get_calendar(self):
        """Yield scheduler actions from the parsed agenda for known datasets."""
        agenda = self.parse_agenda()
        dataset_codes = [d["dataset_code"] for d in self.datasets_list()]

        """First line - exclude first 2 columns (title1, title2)"""
        months = agenda[0][2:]

        """All line moins first list"""
        periods = agenda[1:]

        def _get_dataset_code(title):
            # Map an agenda title back to a DATASETS key via agenda_titles.
            for key, d in DATASETS.items():
                if title in d.get("agenda_titles", []):
                    return key
            return None

        for period in periods:
            title = period[0]
            if period[1]:
                title = "%s %s" % (title, period[1])
            dataset_code = _get_dataset_code(title)
            if not dataset_code:
                logger.info("exclude calendar action for not implemented dataset[%s]" % title)
                continue
            if not dataset_code in dataset_codes:
                logger.info("exclude calendar action for dataset[%s]" % title)
                continue
            days = period[2:]
            # Keep only (month, day) pairs where a day was scheduled.
            scheds = [d for d in zip(months, days) if not d[1] is None]
            for date_base, day in scheds:
                yield {
                    "action": "update_node",
                    "kwargs": {"provider_name": self.provider_name, "dataset_code": dataset_code},
                    "period_type": "date",
                    "period_kwargs": {
                        "run_date": datetime.datetime(date_base.year, date_base.month, int(day), 8, 0, 0),
                        "timezone": pytz.country_timezones(AGENDA["country"]),
                    },
                }
class INSEE(Fetcher):
    """Fetcher for INSEE (France) via its SDMX endpoint."""

    def __init__(self, db=None, sdmx=None, **kwargs):
        super().__init__(provider_name='INSEE', db=db, **kwargs)

        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='National Institute of Statistics and Economic Studies',
                                      version=VERSION,
                                      region='France',
                                      website='http://www.insee.fr',
                                      fetcher=self)
            self.provider.update_database()

        # Re-store the provider document when the fetcher version changed.
        if self.provider.version != VERSION:
            self.provider.update_database()

        self.sdmx = sdmx or Request(agency='INSEE')

        # SDMX structure caches, lazily filled by _load_structure().
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None

    def _load_structure(self, force=False):
        """Fetch and cache the SDMX dataflow list (idempotent unless force)."""
        if self._dataflows and not force:
            return

        """
        #http://www.bdm.insee.fr/series/sdmx/categoryscheme
        categoryscheme_response = self.sdmx.get(resource_type='categoryscheme', params={"references": None})
        logger.debug(categoryscheme_response.url)
        self._categoryschemes = categoryscheme_response.msg.categoryschemes

        #http://www.bdm.insee.fr/series/sdmx/categorisation
        categorisation_response = self.sdmx.get(resource_type='categorisation')
        logger.debug(categorisation_response.url)
        self._categorisations = categorisation_response.msg.categorisations
        """

        #http://www.bdm.insee.fr/series/sdmx/dataflow
        dataflows_response = self.sdmx.get(resource_type='dataflow')
        logger.debug(dataflows_response.url)
        self._dataflows = dataflows_response.msg.dataflows

    def load_datasets_first(self):
        """Initial load: upsert every dataset from the data tree.

        NOTE(review): iterates self.datasets_list() directly — elsewhere in
        this project the loop extracts d["dataset_code"] first; confirm
        upsert_dataset() accepts whatever datasets_list() yields here.
        """
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))

        for dataset_code in self.datasets_list():
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                # Best-effort: keep loading remaining datasets.
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: incremental update not implemented — full reload for now.
        self.load_datasets_first()

    def build_data_tree(self, force_update=False):
        """Build data_tree from structure datas
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        self._load_structure()

        for dataset_code, dataset in self._dataflows.items():
            name = dataset.name
            # Prefer the English name, fall back to French.
            if "en" in dataset.name:
                name = dataset.name.en
            else:
                name = dataset.name.fr
            self.provider.add_dataset(dict(dataset_code=dataset_code, name=name),
                                      self.provider_name)

        return self.provider.data_tree

        # NOTE(review): everything below is unreachable (after the return
        # above) — category-based tree building kept as dead code; delete or
        # re-enable deliberately.
        for category in self._categoryschemes.aslist():
            _category = dict(name=category.name.en,
                             category_code=category.id)
            category_key = self.provider.add_category(_category)
            for subcategory in category.values():
                if not subcategory.id in self._categorisations:
                    continue
                _subcategory = dict(name=subcategory.name.en,
                                    category_code=subcategory.id)
                _subcategory_key = self.provider.add_category(_subcategory,
                                                              parent_code=category_key)
                try:
                    _categorisation = self._categorisations[subcategory.id]
                    for i in _categorisation:
                        _d = self._dataflows[i.artefact.id]
                        self.provider.add_dataset(dict(dataset_code=_d.id,
                                                       name=_d.name.en),
                                                  _subcategory_key)
                except Exception as err:
                    logger.error(err)
                    raise

        return self.provider.data_tree

    def upsert_dataset(self, dataset_code):
        """Download and store one dataset; returns update_database()'s result."""
        #self.load_structure(force=False)

        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        #if not dataset_code in self._dataflows:
        #    raise Exception("This dataset is unknown: %s" % dataset_code)

        #dataflow = self._dataflows[dataset_code]

        #cat = self.db[constants.COL_CATEGORIES].find_one({'category_code': dataset_code})
        #dataset.name = cat['name']
        #dataset.doc_href = cat['doc_href']
        #dataset.last_update = cat['last_update']

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           #name=dataflow.name.en,
                           doc_href=None,
                           last_update=datetime.now(),  #TODO:
                           fetcher=self)

        dataset_doc = self.db[constants.COL_DATASETS].find_one({'provider_name': self.provider_name,
                                                                "dataset_code": dataset_code})

        insee_data = INSEE_Data(dataset=dataset,
                                dataset_doc=dataset_doc,
                                #dataflow=dataflow,
                                #sdmx=self.sdmx
                                )
        dataset.series.data_iterator = insee_data
        result = dataset.update_database()

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

        """
        > IDBANK: A définir dynamiquement sur site ?
        doc_href d'une serie: http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=001694226

        > CODE GROUPE: Balance des Paiements mensuelle - Compte de capital
        http://www.bdm.insee.fr/bdm2/choixCriteres?codeGroupe=1556
        """

        return result
class Eurostat(Fetcher):
    """Class for managing the SDMX endpoint from eurostat in dlstats."""

    def __init__(self, **kwargs):
        super().__init__(provider_name='EUROSTAT', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='Eurostat',
            version=VERSION,
            region='Europe',
            website='http://ec.europa.eu/eurostat',
            terms_of_use='http://ec.europa.eu/eurostat/about/our-partners/copyright',
            fetcher=self)

        # Category-code fragments selecting which branches to harvest.
        self.categories_filter = [
            'nama10',
            'namq_10',
            'nasa_10',
            'nasq_10',
            'naid_10',
            'nama',
            'namq',
            'nasa',
            'nasq',
            'gov',
            'ert',
            'irt',
            'prc',
            'bop',
            'bop_6',
            'demo',  # We harvest demo because we need demo_pjanbroad.
            'lfsi_act_q',
            'euroind',
            'pop',
            'labour',
        ]

        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        # Set to True once a newer catalog has been seen in build_data_tree().
        self.updated_catalog = False

    def _is_updated_catalog(self, creation_date):
        """Record the catalog creation_date in provider metadata.

        Returns True when the date is new or newer than the stored one
        (and persists it), False when the catalog is unchanged.
        """
        if not self.provider.from_db:
            self.provider_verify()
        if not self.provider.metadata:
            self.provider.metadata = {}

        if not "creation_date" in self.provider.metadata:
            self.provider.metadata["creation_date"] = creation_date
            self.provider.update_database()
            return True

        if creation_date > self.provider.metadata["creation_date"]:
            self.provider.metadata["creation_date"] = creation_date
            self.provider.update_database()
            return True

        return False

    def build_data_tree(self):
        """Builds the data tree
        """
        download = Downloader(url=self.url_table_of_contents,
                              filename="table_of_contents.xml",
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()

        categories = []
        categories_keys = []

        # Stream only the <leaf> (dataset) elements from the large TOC file.
        it = etree.iterparse(filepath, events=['end'],
                             tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

        def is_selected(parent_codes):
            """parent_codes is array of category_code
            """
            for _select in self.categories_filter:
                if _select in parent_codes:
                    return True
            return False

        def get_category(category_code):
            # Linear lookup of an already-created category dict.
            for c in categories:
                if c["category_code"] == category_code:
                    return c

        def create_categories(parent_codes, parent_titles, position):
            # Walks ancestors innermost-first (pop from the end), creating
            # each missing category with its parent chain.
            position += 1
            for i in range(len(parent_codes)):
                category_code = parent_codes.pop()
                name = parent_titles.pop()
                all_parents = parent_codes.copy()
                parent = None
                if all_parents:
                    parent = all_parents[-1]
                if not category_code in categories_keys:
                    _category = {
                        "provider_name": self.provider_name,
                        "category_code": category_code,
                        "name": name,
                        "position": position + i,
                        "parent": parent,
                        'all_parents': all_parents,
                        "datasets": [],
                        "doc_href": None,
                        "metadata": None
                    }
                    categories_keys.append(category_code)
                    categories.append(_category)

        position = 0
        is_verify_creation_date = False

        for event, dataset in it:

            # Check the catalog creation date once, on the first element.
            if is_verify_creation_date is False:
                _root = dataset.getroottree().getroot()
                creation_date_str = _root.attrib.get("creationDate")
                creation_date = clean_datetime(
                    datetime.strptime(creation_date_str, '%Y%m%dT%H%M'))
                if self._is_updated_catalog(creation_date) is False:
                    msg = "no update from eurostat catalog. current[%s] - db[%s]"
                    logger.warning(msg % (creation_date,
                                          self.provider.metadata["creation_date"]))
                    if not self.force_update:
                        return []
                is_verify_creation_date = True
                if not self.force_update:
                    self.updated_catalog = True

            parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                         namespaces=TABLE_OF_CONTENT_NSMAP)
            if not is_selected(parent_codes):
                continue

            parent_titles = dataset.xpath(
                "ancestor::nt:branch/nt:title[attribute::language='en']/text()",
                namespaces=TABLE_OF_CONTENT_NSMAP)
            category_code = parent_codes[-1]
            create_categories(parent_codes, parent_titles, position)

            category = get_category(category_code)

            name = xpath_title(dataset)[0]
            last_update = xpath_ds_last_update(dataset)
            last_modified = xpath_ds_last_modified(dataset)
            doc_href = xpath_ds_metadata_html(dataset)
            data_start = xpath_ds_data_start(dataset)
            data_end = xpath_ds_data_end(dataset)
            values = xpath_ds_values(dataset)

            last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
            if last_modified:
                # Use whichever of last_update / last_modified is newest.
                last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
                last_update = max(last_update, last_modified)

            dataset_code = xpath_code(dataset)[0]
            _dataset = {
                "dataset_code": dataset_code,
                "name": name,
                "last_update": clean_datetime(last_update),
                "metadata": {
                    "doc_href": first_element_xpath(doc_href),
                    "data_start": first_element_xpath(data_start),
                    "data_end": first_element_xpath(data_end),
                    "values": int(first_element_xpath(values, default="0")),
                }
            }
            category["datasets"].append(_dataset)

        # Schedule the downloaded TOC file for cleanup.
        self.for_delete.append(filepath)

        return categories

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets
        """
        self.get_selected_datasets()

        doc = self.db[constants.COL_DATASETS].find_one(
            {'provider_name': self.provider_name,
             'dataset_code': dataset_code},
            {'dataset_code': 1, 'last_update': 1})

        dataset_settings = self.selected_datasets[dataset_code]

        # Reject datasets already up to date in the database.
        if doc and doc['last_update'] >= dataset_settings['last_update']:
            comments = "update-date[%s]" % doc['last_update']
            raise errors.RejectUpdatedDataset(provider_name=self.provider_name,
                                              dataset_code=dataset_code,
                                              comments=comments)

        dataset = Datasets(
            provider_name=self.provider_name,
            dataset_code=dataset_code,
            name=dataset_settings["name"],
            doc_href=dataset_settings["metadata"].get("doc_href"),
            last_update=None,
            fetcher=self)

        dataset.last_update = dataset_settings["last_update"]
        dataset.series.data_iterator = EurostatData(dataset)

        return dataset.update_database()

    def get_calendar(self):
        """Yield two daily cron actions (11:01 and 23:01 Europe/Paris)."""
        yield {
            "action": "update-fetcher",
            "period_type": "cron",
            "kwargs": {"provider_name": self.provider_name},
            "period_kwargs": {
                "day": '*',
                "hour": 11,
                "minute": 1,
                "timezone": 'Europe/Paris'
            }
        }
        yield {
            "action": "update-fetcher",
            "period_type": "cron",
            "kwargs": {"provider_name": self.provider_name},
            "period_kwargs": {
                "day": '*',
                "hour": 23,
                "minute": 1,
                "timezone": 'Europe/Paris'
            }
        }

    def load_datasets_update(self):
        """Upsert every dataset whose catalog last_update is newer than the DB's."""
        datasets_list = self.datasets_list()

        if not self.updated_catalog and not self.force_update:
            msg = "update aborted for updated catalog"
            logger.warning(msg)
            # NOTE(review): no `return` here — execution continues despite the
            # "aborted" message; confirm whether a return is missing.

        dataset_codes = [d["dataset_code"] for d in datasets_list]

        #TODO: enable ?
        cursor = self.db[constants.COL_DATASETS].find(
            {'provider_name': self.provider_name,
             'dataset_code': {'$in': dataset_codes}},
            {'dataset_code': 1, 'last_update': 1})

        selected_datasets = {s['dataset_code']: s for s in cursor}

        for dataset in datasets_list:
            dataset_code = dataset["dataset_code"]
            last_update_from_catalog = dataset['last_update']
            last_update_from_dataset = selected_datasets.get(
                dataset_code, {}).get('last_update')

            if (dataset_code not in selected_datasets) or (
                    last_update_from_catalog > last_update_from_dataset):
                try:
                    self.wrap_upsert_dataset(dataset_code)
                except Exception as err:
                    # MaxErrors aborts the whole run; anything else is logged.
                    if isinstance(err, errors.MaxErrors):
                        raise
                    msg = "error for provider[%s] - dataset[%s]: %s"
                    logger.critical(
                        msg % (self.provider_name, dataset_code, str(err)))
            else:
                msg = "bypass update - provider[%s] - dataset[%s] - last-update-dataset[%s] - last-update-catalog[%s]"
                logger.info(
                    msg % (self.provider_name, dataset_code,
                           last_update_from_dataset, last_update_from_catalog))
class DESTATIS(Fetcher):
    """Fetcher for Germany's Statistisches Bundesamt (DESTATIS)."""

    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='DESTATIS', db=db, **kwargs)

        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='Statistisches Bundesamt',
                                      version=VERSION,
                                      region='Germany',
                                      website='https://www.destatis.de',
                                      fetcher=self)

        # Re-store the provider document when the fetcher version changed.
        if self.provider.version != VERSION:
            self.provider.update_database()

    def build_data_tree(self, force_update=False):
        # Data tree building is disabled; the DATASETS-based version is
        # kept below as a commented-out reference.
        return []
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category({"name": dataset["name"],
                                                       "category_code": category_code,
                                                       "doc_href": dataset["doc_href"]})
            _dataset = {"name": dataset["name"],
                        "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)

        return self.provider.data_tree
        """

    def upsert_dataset(self, dataset_code):
        """Download and store one dataset; returns update_database()'s result."""
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        #TODO: check whether the dataset already exists or only needs an update !!!

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           last_update=datetime.now(),
                           fetcher=self)

        _data = DESTATIS_Data(dataset=dataset,
                              ns_tag_data=DATASETS[dataset_code]["ns_tag_data"])
        dataset.series.data_iterator = _data
        result = dataset.update_database()
        # Drop the iterator reference once the load is done.
        _data = None

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

        return result

    def load_datasets_first(self):
        """Initial load: persist provider and data tree, then every dataset."""
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))

        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                # Best-effort: keep loading remaining datasets.
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: incremental update not implemented — full reload for now.
        self.load_datasets_first()
class Eurostat(Fetcher):
    """Class for managing the SDMX endpoint from eurostat in dlstats."""

    def __init__(self, db=None):
        super().__init__(provider_name='Eurostat', db=db)

        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='Eurostat',
                                      version=VERSION,
                                      region='Europe',
                                      website='http://ec.europa.eu/eurostat',
                                      fetcher=self)
            self.provider.update_database()

        if self.provider.version != VERSION:
            self.provider.update_database()

        # Only datasets whose category path contains one of these codes are
        # selected for download.
        self.selected_codes = ['nama_10', 'namq_10', 'nasa_10', 'nasq_10',
                               'naid_10', 'nama', 'namq', 'nasa', 'nasq',
                               'gov', 'ert', 'irt', 'prc', 'bop', 'bop_6',
                               'demo_pjanbroad', 'lfsi_act_q']

        self.selected_datasets = {}
        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.dataset_url = None

    def build_data_tree(self, force_update=False):
        """Builds the data tree.

        Categories are created only for branches in which one of the branch
        <code> elements appears in selected_codes. The same filter applies to
        datasets: a dataset is kept only when the category codes of its
        parents intersect selected_codes.

        :param force_update: rebuild even if a data tree already exists
        :returns: the provider data tree
        """
        start = time.time()
        logger.info("build_data_tree provider[%s] - START" % self.provider_name)

        if self.provider.count_data_tree() > 1 and not force_update:
            logger.info("use existing data-tree for provider[%s]" % self.provider_name)
            return self.provider.data_tree

        filepath = self.get_table_of_contents()
        it = etree.iterparse(filepath, events=['end'])

        def is_selected(parent_codes):
            """parent_codes is an array of category_code values."""
            for _select in self.selected_codes:
                if _select in parent_codes:
                    return True
            return False

        for event, element in it:
            if event == 'end':
                if element.tag == fixtag_toc('nt', 'branch'):
                    for child in element.iterchildren(tag=fixtag_toc('nt', 'children')):
                        _parent_codes = xpath_parent_codes(child)
                        _parents = xpath_ancestor_branch(child)

                        if not is_selected(_parent_codes):
                            continue

                        for parent in _parents:
                            _parent_code = xpath_code(parent)[0]
                            _parent_title = xpath_title(parent)[0]
                            # Left-hand part of the parent category path.
                            _parent_categories = ".".join(_parent_codes[:_parent_codes.index(_parent_code)])

                            _category = None
                            _parent = None
                            if not _parent_categories or len(_parent_categories) == 0:
                                _category = {"category_code": _parent_code,
                                             "name": _parent_title}
                            else:
                                _parent = self.provider._category_key(_parent_categories)
                                _category = {"category_code": _parent_code,
                                             "name": _parent_title}
                            try:
                                _key = self.provider.add_category(_category, _parent)
                            except Exception:
                                # BUGFIX: was a bare `except:`. Deliberate
                                # best-effort: the category may already exist.
                                pass

                        datasets = xpath_datasets(child)
                        for dataset in datasets:
                            parent_codes = xpath_parent_codes(dataset)
                            dataset_code = xpath_code(dataset)[0]
                            category_code = self.provider._category_key(".".join(parent_codes))

                            # Keep only datasets with at least one category
                            # code present in selected_codes.
                            if not is_selected(parent_codes):
                                continue

                            name = xpath_title(dataset)[0]
                            last_update = xpath_ds_last_update(dataset)
                            last_modified = xpath_ds_last_modified(dataset)
                            doc_href = xpath_ds_metadata_html(dataset)
                            data_start = xpath_ds_data_start(dataset)
                            data_end = xpath_ds_data_end(dataset)
                            values = xpath_ds_values(dataset)

                            last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
                            if last_modified:
                                last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
                                last_update = max(last_update, last_modified)

                            dataset = {
                                "dataset_code": dataset_code,
                                "name": name,
                                "last_update": last_update,
                                "metadata": {
                                    "doc_href": first_element_xpath(doc_href),
                                    "data_start": first_element_xpath(data_start),
                                    "data_end": first_element_xpath(data_end),
                                    "values": int(first_element_xpath(values, default="0")),
                                }
                            }
                            self.provider.add_dataset(dataset, category_code)
                            dataset.clear()
                        child.clear()
                # Free the parsed element to keep iterparse memory bounded.
                element.clear()

        end = time.time() - start
        logger.info("build_data_tree load provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

        return self.provider.data_tree

    def get_table_of_contents(self):
        """Download the Eurostat table of contents and return the local path."""
        return Downloader(url=self.url_table_of_contents,
                          filename="table_of_contents.xml").get_filepath()

    def get_selected_datasets(self):
        """Collects the dataset codes that are in table of contents
        below the ones indicated in "selected_codes" provided in configuration

        :returns: dict of dataset settings keyed by dataset_code"""
        category_filter = [".*%s.*" % d for d in self.selected_codes]
        category_filter = "|".join(category_filter)
        self.selected_datasets = {d['dataset_code']: d
                                  for d in self.datasets_list(category_filter=category_filter)}
        return self.selected_datasets

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets

        :param dataset_code: dataset code in selected_datasets
        :returns: None"""
        self.get_selected_datasets()

        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        dataset_settings = self.selected_datasets[dataset_code]

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=dataset_settings["name"],
                           doc_href=dataset_settings["metadata"].get("doc_href"),
                           last_update=dataset_settings["last_update"],
                           fetcher=self)

        data_iterator = EurostatData(dataset, filename=dataset_code)
        dataset.series.data_iterator = data_iterator
        dataset.update_database()

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def load_datasets_first(self):
        """Initial load of every selected dataset.

        Per-dataset failures are logged and do not abort the run.
        """
        self.get_selected_datasets()

        start = time.time()
        logger.info("first load provider[%s] - START" % (self.provider_name))

        for dataset_code in self.selected_datasets.keys():
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("first load provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        """Update only datasets whose catalog last_update is newer than the
        last_update already stored in the database (or missing from it)."""
        self.get_selected_datasets()

        start = time.time()
        logger.info("update provider[%s] - START" % (self.provider_name))

        selected_datasets = self.db[constants.COL_DATASETS].find(
            {'provider_name': self.provider_name,
             'dataset_code': {'$in': list(self.selected_datasets.keys())}},
            {'dataset_code': 1, 'last_update': 1})

        selected_datasets = {s['dataset_code']: s for s in selected_datasets}

        for dataset_code, dataset in self.selected_datasets.items():
            # New dataset, or catalog is more recent than the stored copy.
            if (dataset_code not in selected_datasets) or (selected_datasets[dataset_code]['last_update'] < dataset['last_update']):
                try:
                    self.upsert_dataset(dataset_code)
                except Exception as err:
                    logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("update provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
class Eurostat(Fetcher):
    """Class for managing the SDMX endpoint from eurostat in dlstats."""

    def __init__(self, **kwargs):
        super().__init__(provider_name="EUROSTAT", version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name="Eurostat",
            version=VERSION,
            region="Europe",
            website="http://ec.europa.eu/eurostat",
            terms_of_use="http://ec.europa.eu/eurostat/about/our-partners/copyright",
            fetcher=self,
        )

        # Only branches containing one of these codes are kept when the
        # table of contents is converted into a data tree.
        self.categories_filter = [
            "nama_10", "namq_10", "nasa_10", "nasq_10", "naid_10",
            "nama", "namq", "nasa", "nasq",
            "gov", "ert", "irt", "prc", "bop", "bop_6",
            "demo_pjanbroad", "lfsi_act_q",
            "euroind", "pop", "labour",
        ]

        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"

        # Set by build_data_tree() when the remote catalog is newer than the
        # creation date recorded in the provider metadata.
        self.updated_catalog = False

    def _is_updated_catalog(self, creation_date):
        """Return True when creation_date is newer than the stored catalog date.

        Persists creation_date in the provider metadata whenever it is new
        (first run or a more recent catalog).

        :param creation_date: datetime parsed from the catalog root attribute
        """
        if not self.provider.from_db:
            self.provider_verify()

        if not self.provider.metadata:
            self.provider.metadata = {}

        if "creation_date" not in self.provider.metadata:
            # First run: record the date and treat the catalog as updated.
            self.provider.metadata["creation_date"] = creation_date
            self.provider.update_database()
            return True

        if creation_date > self.provider.metadata["creation_date"]:
            self.provider.metadata["creation_date"] = creation_date
            self.provider.update_database()
            return True

        return False

    def build_data_tree(self):
        """Builds the data tree from the Eurostat table of contents.

        :returns: list of category dicts (each with a "datasets" list), or
            an empty list when the catalog has not changed and no forced
            update was requested.
        """
        download = Downloader(
            url=self.url_table_of_contents,
            filename="table_of_contents.xml",
            store_filepath=self.store_path,
            use_existing_file=self.use_existing_file,
        )
        filepath = download.get_filepath()

        categories = []
        categories_keys = []

        # Iterate only over <leaf> elements (the datasets).
        it = etree.iterparse(filepath, events=["end"], tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

        def is_selected(parent_codes):
            """parent_codes is an array of category_code values."""
            for _select in self.categories_filter:
                if _select in parent_codes:
                    return True
            return False

        def get_category(category_code):
            # Linear scan; the category list stays small after filtering.
            for c in categories:
                if c["category_code"] == category_code:
                    return c

        def create_categories(parent_codes, parent_titles, position):
            # NOTE: consumes the caller's parent_codes/parent_titles lists
            # by pop(), walking from leaf category up to the root.
            position += 1
            for i in range(len(parent_codes)):
                category_code = parent_codes.pop()
                name = parent_titles.pop()
                all_parents = parent_codes.copy()
                parent = None
                if all_parents:
                    parent = all_parents[-1]
                if category_code not in categories_keys:
                    _category = {
                        "provider_name": self.provider_name,
                        "category_code": category_code,
                        "name": name,
                        "position": position + i,
                        "parent": parent,
                        "all_parents": all_parents,
                        "datasets": [],
                        "doc_href": None,
                        "metadata": None,
                    }
                    categories_keys.append(category_code)
                    categories.append(_category)

        position = 0
        is_verify_creation_date = False

        for event, dataset in it:

            if is_verify_creation_date is False:
                # The creationDate attribute on the XML root tells whether
                # the catalog changed since the last run.
                _root = dataset.getroottree().getroot()
                creation_date_str = _root.attrib.get("creationDate")
                creation_date = clean_datetime(datetime.strptime(creation_date_str, "%Y%m%dT%H%M"))

                if self._is_updated_catalog(creation_date) is False:
                    msg = "no update from eurostat catalog. current[%s] - db[%s]"
                    logger.warning(msg % (creation_date, self.provider.metadata["creation_date"]))
                    if not self.force_update:
                        return []

                is_verify_creation_date = True

                if not self.force_update:
                    self.updated_catalog = True

            parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()", namespaces=TABLE_OF_CONTENT_NSMAP)

            if not is_selected(parent_codes):
                continue

            parent_titles = dataset.xpath(
                "ancestor::nt:branch/nt:title[attribute::language='en']/text()", namespaces=TABLE_OF_CONTENT_NSMAP
            )

            category_code = parent_codes[-1]
            create_categories(parent_codes, parent_titles, position)
            category = get_category(category_code)

            name = xpath_title(dataset)[0]
            last_update = xpath_ds_last_update(dataset)
            last_modified = xpath_ds_last_modified(dataset)
            doc_href = xpath_ds_metadata_html(dataset)
            data_start = xpath_ds_data_start(dataset)
            data_end = xpath_ds_data_end(dataset)
            values = xpath_ds_values(dataset)

            last_update = datetime.strptime(last_update[0], "%d.%m.%Y")
            if last_modified:
                last_modified = datetime.strptime(last_modified[0], "%d.%m.%Y")
                last_update = max(last_update, last_modified)

            dataset_code = xpath_code(dataset)[0]
            _dataset = {
                "dataset_code": dataset_code,
                "name": name,
                "last_update": clean_datetime(last_update),
                "metadata": {
                    "doc_href": first_element_xpath(doc_href),
                    "data_start": first_element_xpath(data_start),
                    "data_end": first_element_xpath(data_end),
                    "values": int(first_element_xpath(values, default="0")),
                },
            }
            category["datasets"].append(_dataset)

        self.for_delete.append(filepath)

        return categories

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets.

        :param dataset_code: code of the dataset to load
        :raises errors.RejectUpdatedDataset: when the stored copy is already
            at least as recent as the catalog entry
        :returns: result of ``Datasets.update_database()``
        """
        self.get_selected_datasets()

        doc = self.db[constants.COL_DATASETS].find_one(
            {"provider_name": self.provider_name, "dataset_code": dataset_code}, {"dataset_code": 1, "last_update": 1}
        )

        dataset_settings = self.selected_datasets[dataset_code]

        if doc and doc["last_update"] >= dataset_settings["last_update"]:
            comments = "update-date[%s]" % doc["last_update"]
            raise errors.RejectUpdatedDataset(
                provider_name=self.provider_name, dataset_code=dataset_code, comments=comments
            )

        dataset = Datasets(
            provider_name=self.provider_name,
            dataset_code=dataset_code,
            name=dataset_settings["name"],
            doc_href=dataset_settings["metadata"].get("doc_href"),
            last_update=None,
            fetcher=self,
        )
        dataset.last_update = dataset_settings["last_update"]

        dataset.series.data_iterator = EurostatData(dataset)

        return dataset.update_database()

    def get_calendar(self):
        """Yield scheduler entries: update twice a day (11:01 and 23:01 Paris time)."""
        yield {
            "action": "update-fetcher",
            "period_type": "cron",
            "kwargs": {"provider_name": self.provider_name},
            "period_kwargs": {"day": "*", "hour": 11, "minute": 1, "timezone": "Europe/Paris"},
        }
        yield {
            "action": "update-fetcher",
            "period_type": "cron",
            "kwargs": {"provider_name": self.provider_name},
            "period_kwargs": {"day": "*", "hour": 23, "minute": 1, "timezone": "Europe/Paris"},
        }

    def load_datasets_update(self):
        """Upsert every dataset whose catalog entry is newer than the stored copy."""
        datasets_list = self.datasets_list()

        if not self.updated_catalog and not self.force_update:
            msg = "update aborted for updated catalog"
            logger.warning(msg)
            # NOTE(review): execution falls through and the update proceeds
            # despite the "aborted" warning - looks like a missing `return`;
            # confirm intent before changing.

        dataset_codes = [d["dataset_code"] for d in datasets_list]

        # TODO: enable ?
        cursor = self.db[constants.COL_DATASETS].find(
            {"provider_name": self.provider_name, "dataset_code": {"$in": dataset_codes}},
            {"dataset_code": 1, "last_update": 1},
        )

        selected_datasets = {s["dataset_code"]: s for s in cursor}

        for dataset in datasets_list:
            dataset_code = dataset["dataset_code"]
            last_update_from_catalog = dataset["last_update"]
            # None when the dataset has never been stored; the membership
            # test below short-circuits before the date comparison then.
            last_update_from_dataset = selected_datasets.get(dataset_code, {}).get("last_update")

            if (dataset_code not in selected_datasets) or (last_update_from_catalog > last_update_from_dataset):
                try:
                    self.wrap_upsert_dataset(dataset_code)
                except Exception as err:
                    if isinstance(err, errors.MaxErrors):
                        raise
                    msg = "error for provider[%s] - dataset[%s]: %s"
                    logger.critical(msg % (self.provider_name, dataset_code, str(err)))
            else:
                msg = "bypass update - provider[%s] - dataset[%s] - last-update-dataset[%s] - last-update-catalog[%s]"
                logger.info(
                    msg % (self.provider_name, dataset_code, last_update_from_dataset, last_update_from_catalog)
                )