def test_update_database(self):
    # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_update_database

    self._collections_is_empty()

    f = Fetcher(provider_name="p1", db=self.db)
    p = Providers(name="p1", long_name="Provider One", version=1,
                  region="Dreamland", website="http://www.example.com", fetcher=f)

    _id = p.update_database()
    self.assertIsNotNone(_id)
    self.assertIsInstance(_id, ObjectId)

    self.db[constants.COL_PROVIDERS].find_one({'_id': ObjectId(_id)})

    bson = self.db[constants.COL_PROVIDERS].find_one({"name": "p1"})
    self.assertIsNotNone(bson)
    self.assertEqual(bson["name"], "p1")
    self.assertEqual(bson["website"], "http://www.example.com")
def test_version_field(self):
    # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_version_field

    self._collections_is_empty()

    f = Fetcher(provider_name="p1", db=self.db)

    with self.assertRaises(MultipleInvalid):
        Providers(name="p1", long_name="Provider One",
                  region="Dreamland", website="http://www.example.com", fetcher=f)

    p = Providers(name="p1", long_name="Provider One", version=1,
                  region="Dreamland", website="http://www.example.com", fetcher=f)
    p.update_database()

    self.assertEqual(self.db[constants.COL_PROVIDERS].count(), 1)
def __init__(self, db=None, **kwargs):
    super().__init__(provider_name='DESTATIS', db=db, **kwargs)

    if not self.provider:
        self.provider = Providers(name=self.provider_name, long_name='Statistisches Bundesamt',
                                  version=VERSION, region='Germany',
                                  website='https://www.destatis.de', fetcher=self)

    if self.provider.version != VERSION:
        self.provider.update_database()
def __init__(self, **kwargs):
    super().__init__(provider_name='INSEE', version=VERSION, **kwargs)

    self.provider = Providers(
        name=self.provider_name,
        long_name='National Institute of Statistics and Economic Studies',
        version=VERSION,
        region='France',
        website='http://www.insee.fr',
        terms_of_use='http://www.insee.fr/en/service/default.asp?page=rediffusion/rediffusion.htm',
        fetcher=self)

    self.xml_sdmx = XMLSDMX(agencyID=self.provider_name,
                            store_filepath=self.store_path,
                            use_existing_file=self.use_existing_file)
    self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                sdmx_client=self.xml_sdmx)

    self._dataflows = None
    self._categoryschemes = None
    self._categorisations = None
    self._categorisations_categories = None
    self._concepts = None
    self._codelists = OrderedDict()

    self.requests_client = requests.Session()
def __init__(self, db=None, **kwargs):
    super().__init__(provider_name='DESTATIS', db=db, **kwargs)

    self.provider = Providers(name=self.provider_name, long_name='Statistisches Bundesamt',
                              version=VERSION, region='Germany',
                              website='https://www.destatis.de', fetcher=self)
def __init__(self, db=None, **kwargs):
    super().__init__(provider_name='OECD', db=db, **kwargs)

    self.provider_name = 'OECD'
    self.provider = Providers(name=self.provider_name,
                              long_name='Organisation for Economic Co-operation and Development',
                              version=VERSION, region='world',
                              website='http://www.oecd.org', fetcher=self)
def __init__(self, db=None, **kwargs):
    super().__init__(provider_name='IMF', db=db, **kwargs)

    self.provider = Providers(name=self.provider_name, long_name="International Monetary Fund",
                              version=VERSION, region='world',
                              website='http://www.imf.org/', fetcher=self)
def __init__(self, db=None, **kwargs):
    super().__init__(provider_name='FED', db=db, **kwargs)

    self.provider = Providers(name=self.provider_name, long_name='Federal Reserve',
                              version=VERSION, region='US',
                              website='http://www.federalreserve.gov', fetcher=self)
def __init__(self, **kwargs):
    super().__init__(provider_name='DUMMY', version=VERSION, **kwargs)

    self.provider = Providers(name=self.provider_name, long_name='Dummy Fetcher',
                              version=VERSION, region='World',
                              website='http://www.example.org', fetcher=self)
def __init__(self, db=None):
    super().__init__(provider_name='WorldBank', db=db)

    self.provider = Providers(name=self.provider_name, long_name='World Bank',
                              version=VERSION, region='world',
                              website='http://www.worldbank.org/', fetcher=self)
def __init__(self, db=None):
    super().__init__(provider_name='ESRI', db=db)

    self.provider = Providers(name=self.provider_name,
                              long_name='Economic and Social Research Institute, Cabinet Office',
                              version=VERSION, region='Japan',
                              website='http://www.esri.cao.go.jp/index-e.html', fetcher=self)
    self.datasets_dict = {}
    self.selected_codes = ['GDP.Amount']
def test_build_data_tree(self):
    # nosetests -s -v dlstats.tests.fetchers.test_ecb:FetcherTestCase.test_build_data_tree

    self._register_urls_data_tree()

    self.fetcher.build_data_tree()

    # self.maxDiff = None

    provider = self.fetcher.provider
    self.assertEqual(provider.count_data_tree(), 12)

    """
    pprint(provider.data_tree)
    with open(DATA_TREE_FP, "w") as fp:
        json.dump(provider.data_tree, fp, sort_keys=False)
    """

    new_provider = Providers(fetcher=self.fetcher, **provider.bson)

    with open(DATA_TREE_FP) as fp:
        local_data_tree = json.load(fp, object_pairs_hook=OrderedDict)
        new_provider.data_tree = local_data_tree

    # self.assertEqual(provider.data_tree, new_provider.data_tree)

    filter_datasets = provider.datasets(category_filter="ECB.MOBILE_NAVI.06")
    self.assertEqual(len(filter_datasets), 6)
    self.assertEqual(filter_datasets[0]["dataset_code"], "BOP")
    self.assertEqual(filter_datasets[-1]["dataset_code"], "TRD")

    for d in provider.data_tree:
        schemas.data_tree_schema(d)

    provider.update_database()

    doc = self.db[constants.COL_PROVIDERS].find_one({"name": self.fetcher.provider_name})
    self.assertIsNotNone(doc)

    for i, d in enumerate(doc["data_tree"]):
        self.assertEqual(doc["data_tree"][i], provider.data_tree[i])

    count = len(self.fetcher.datasets_list())
    self.assertEqual(count, DATAFLOW_COUNT)
def __init__(self, **kwargs):
    super().__init__(provider_name='BDF', version=2, **kwargs)

    self.provider = Providers(name=self.provider_name, long_name='Banque de France',
                              version=2, region='France',
                              website='http://webstat.banque-france.fr/', fetcher=self)

    self.categories_filter = ['concept']
def __init__(self, **kwargs):
    super().__init__(provider_name='BIS', version=VERSION, **kwargs)

    self.provider = Providers(
        name=self.provider_name,
        long_name='Bank for International Settlements',
        version=VERSION,
        region='World',
        website='http://www.bis.org',
        terms_of_use='https://www.bis.org/terms_conditions.htm',
        fetcher=self)
def __init__(self, **kwargs):
    super().__init__(provider_name='ESRI', version=VERSION, **kwargs)

    self.provider = Providers(
        name=self.provider_name,
        long_name='Economic and Social Research Institute, Cabinet Office',
        version=VERSION,
        region='Japan',
        website='http://www.esri.cao.go.jp/index-e.html',
        fetcher=self)

    self.categories_filter = ['SNA']
def test_unique_constraint(self):
    # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_unique_constraint

    self._collections_is_empty()

    f = Fetcher(provider_name="p1", db=self.db)

    p = Providers(name="p1", long_name="Provider One", version=1,
                  region="Dreamland", website="http://www.example.com", fetcher=f)
    p.update_database()

    self.assertEqual(self.db[constants.COL_PROVIDERS].count(), 1)

    existing_provider = dict(name="p1")

    with self.assertRaises(DuplicateKeyError):
        self.db[constants.COL_PROVIDERS].insert(existing_provider)

    p = Providers(name="p2", long_name="Provider One", version=1,
                  region="Dreamland", website="http://www.example.com", fetcher=f)
    p.update_database()

    self.assertEqual(self.db[constants.COL_PROVIDERS].count(), 2)
def __init__(self, **kwargs):
    super().__init__(provider_name='IMF', version=VERSION, **kwargs)

    self.provider = Providers(
        name=self.provider_name,
        long_name="International Monetary Fund",
        version=VERSION,
        region='World',
        website='http://www.imf.org/',
        terms_of_use='http://www.imf.org/external/terms.htm',
        fetcher=self)

    self.requests_client = requests.Session()
def __init__(self, **kwargs):
    super().__init__(provider_name='OECD', version=VERSION, **kwargs)

    self.provider = Providers(
        name=self.provider_name,
        long_name='Organisation for Economic Co-operation and Development',
        version=VERSION,
        region='World',
        website='http://www.oecd.org',
        terms_of_use='http://www.oecd.org/termsandconditions/',
        fetcher=self)

    self.requests_client = requests.Session()
def __init__(self, **kwargs):
    super().__init__(provider_name='EUROSTAT', version=VERSION, **kwargs)

    self.provider = Providers(
        name=self.provider_name,
        long_name='Eurostat',
        version=VERSION,
        region='Europe',
        website='http://ec.europa.eu/eurostat',
        terms_of_use='http://ec.europa.eu/eurostat/about/our-partners/copyright',
        fetcher=self)

    self.categories_filter = [
        'nama10', 'namq_10', 'nasa_10', 'nasq_10', 'naid_10',
        'nama', 'namq', 'nasa', 'nasq',
        'gov', 'ert', 'irt', 'prc', 'bop', 'bop_6',
        'demo',  # We harvest demo because we need demo_pjanbroad.
        'lfsi_act_q', 'euroind', 'pop', 'labour',
    ]

    self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
    self.updated_catalog = False
def __init__(self, **kwargs):
    super().__init__(provider_name='BEA', version=VERSION, **kwargs)

    self.provider = Providers(
        name=self.provider_name,
        long_name='Bureau of Economic Analysis',
        region='USA',
        version=VERSION,
        website='http://www.bea.gov',
        terms_of_use='http://www.bea.gov/about/BEAciting.htm',
        fetcher=self)

    self._datasets_settings = None
    self._current_urls = {}
def __init__(self, db=None): super().__init__(provider_name="BIS", db=db) if not self.provider: self.provider = Providers( name=self.provider_name, long_name="Bank for International Settlements", version=VERSION, region="world", website="http://www.bis.org", fetcher=self, ) self.provider.update_database() if self.provider.version != VERSION: self.provider.update_database()
def test_add_data_tree(self):
    # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_add_data_tree

    f = Fetcher(provider_name="p1", is_indexes=False)

    p = Providers(name="p1", long_name="Provider One", version=1,
                  region="Dreamland", website="http://www.example.com", fetcher=f)

    self.assertEqual(len(p.data_tree), 1)

    p.data_tree[0]["category_code"] = p.name
    p.data_tree[0]["long_name"] = p.long_name
    p.data_tree[0]["website"] = p.website
    p.update_database()

    minimal_category = {'category_code': "c0", 'name': "p1"}
    p.add_category(minimal_category)

    data_tree = [
        {'category_code': 'p1',
         'datasets': [],
         'description': None,
         'doc_href': 'http://www.example.com',
         'exposed': False,
         'last_update': None,
         'name': 'p1'},
        {'category_code': 'p1.c0',
         'datasets': [],
         'description': None,
         'doc_href': None,
         'exposed': False,
         'last_update': None,
         'name': 'p1'}
    ]

    self.assertEqual(p.data_tree, data_tree)
def __init__(self, **kwargs):
    super().__init__(provider_name='ECB', version=VERSION, **kwargs)

    self.provider = Providers(
        name=self.provider_name,
        long_name='European Central Bank',
        version=VERSION,
        region='Europe',
        website='http://www.ecb.europa.eu',
        terms_of_use='https://www.ecb.europa.eu/home/disclaimer/html/index.en.html',
        fetcher=self)

    self.xml_sdmx = None
    self.xml_dsd = None

    self._dataflows = None
    self._categoryschemes = None
    self._categorisations = None
    self._concepts = None
def __init__(self, db=None, sdmx=None, **kwargs):
    super().__init__(provider_name='ECB', db=db, **kwargs)

    if not self.provider:
        self.provider = Providers(name=self.provider_name, long_name='European Central Bank',
                                  version=VERSION, region='Europe',
                                  website='http://www.ecb.europa.eu', fetcher=self)
        self.provider.update_database()

    if self.provider.version != VERSION:
        self.provider.update_database()

    self.sdmx = sdmx or ECBRequest(agency=self.provider_name)
    self.sdmx.timeout = 90

    self._dataflows = None
    self._categoryschemes = None
    self._categorisations = None
def __init__(self, db=None, sdmx=None, **kwargs):
    super().__init__(provider_name='INSEE', db=db, **kwargs)

    if not self.provider:
        self.provider = Providers(name=self.provider_name,
                                  long_name='National Institute of Statistics and Economic Studies',
                                  version=VERSION, region='France',
                                  website='http://www.insee.fr', fetcher=self)
        self.provider.update_database()

    if self.provider.version != VERSION:
        self.provider.update_database()

    self.sdmx = sdmx or Request(agency='INSEE')

    self._dataflows = None
    self._categoryschemes = None
    self._categorisations = None
def __init__(self, **kwargs):
    super().__init__(provider_name='WORLDBANK', version=VERSION, **kwargs)

    self.provider = Providers(
        name=self.provider_name,
        long_name='World Bank',
        version=VERSION,
        region='World',
        website='http://www.worldbank.org/',
        terms_of_use='http://data.worldbank.org/summary-terms-of-use',
        fetcher=self)

    self.api_url = 'http://api.worldbank.org/v2/'
    self.requests_client = requests.Session()

    self.blacklist = [
        '13',  # Enterprise Surveys
        '26',  # Corporate scorecard - datacatalog id="89"
        '29',  # Global Social Protection
        '31',  # Country Policy and Institutional Assessment (CPIA)
        '36',  # Statistical Capacity Indicators - datacatalog id="8"
        '37',  # LAC Equity Lab
        '41',  # Country Partnership Strategy for India
        '44',  # Readiness for Investment in Sustainable Energy (RISE)
        '45',  # INDO-DAPOER
    ]

    """
    To exclude:
        economycoverage: WLD, EAP, ECA, LAC, MNA, SAS, SSA, HIC, LMY, IBRD, IDA
        numberofeconomies: 214
        topics:
        mobileapp: ???
        > Regional aggregates also appear among the countries, but with id="NA" in region:
        <wb:region id="NA">Aggregates</wb:region>
    """

    self._available_countries = None
    self._available_countries_by_name = None
def __init__(self, **kwargs): super().__init__(provider_name="EUROSTAT", version=VERSION, **kwargs) self.provider = Providers( name=self.provider_name, long_name="Eurostat", version=VERSION, region="Europe", website="http://ec.europa.eu/eurostat", terms_of_use="http://ec.europa.eu/eurostat/about/our-partners/copyright", fetcher=self, ) self.categories_filter = [ "nama_10", "namq_10", "nasa_10", "nasq_10", "naid_10", "nama", "namq", "nasa", "nasq", "gov", "ert", "irt", "prc", "bop", "bop_6", "demo_pjanbroad", "lfsi_act_q", "euroind", "pop", "labour", ] self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml" self.updated_catalog = False
def __init__(self, db=None):
    super().__init__(provider_name='Eurostat', db=db)

    if not self.provider:
        self.provider = Providers(name=self.provider_name, long_name='Eurostat',
                                  version=VERSION, region='Europe',
                                  website='http://ec.europa.eu/eurostat', fetcher=self)
        self.provider.update_database()

    if self.provider.version != VERSION:
        self.provider.update_database()

    self.selected_codes = [
        'nama_10', 'namq_10', 'nasa_10', 'nasq_10', 'naid_10',
        'nama', 'namq', 'nasa', 'nasq',
        'gov', 'ert', 'irt', 'prc', 'bop', 'bop_6',
        'demo_pjanbroad', 'lfsi_act_q'
    ]
    self.selected_datasets = {}
    self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
    self.dataset_url = None
def __init__(self, **kwargs):
    super().__init__(provider_name='EUROSTAT', version=VERSION, **kwargs)

    self.provider = Providers(name=self.provider_name, long_name='Eurostat',
                              version=VERSION, region='Europe',
                              website='http://ec.europa.eu/eurostat',
                              terms_of_use='http://ec.europa.eu/eurostat/about/our-partners/copyright',
                              fetcher=self)

    self.categories_filter = [
        'nama10', 'namq_10', 'nasa_10', 'nasq_10', 'naid_10',
        'nama', 'namq', 'nasa', 'nasq',
        'gov', 'ert', 'irt', 'prc', 'bop', 'bop_6',
        'demo',  # We harvest demo because we need demo_pjanbroad.
        'lfsi_act_q', 'euroind', 'pop', 'labour',
    ]

    self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
    self.updated_catalog = False
class BIS(Fetcher): def __init__(self, db=None): super().__init__(provider_name="BIS", db=db) if not self.provider: self.provider = Providers( name=self.provider_name, long_name="Bank for International Settlements", version=VERSION, region="world", website="http://www.bis.org", fetcher=self, ) self.provider.update_database() if self.provider.version != VERSION: self.provider.update_database() def upsert_dataset(self, dataset_code): start = time.time() logger.info("upsert dataset[%s] - START" % (dataset_code)) if not DATASETS.get(dataset_code): raise Exception("This dataset is unknown" + dataset_code) dataset = Datasets( provider_name=self.provider_name, dataset_code=dataset_code, name=DATASETS[dataset_code]["name"], doc_href=DATASETS[dataset_code]["doc_href"], fetcher=self, ) fetcher_data = BIS_Data(dataset, url=DATASETS[dataset_code]["url"], filename=DATASETS[dataset_code]["filename"]) if fetcher_data.is_updated(): dataset.series.data_iterator = fetcher_data dataset.update_database() # TODO: clean datas (file temp) end = time.time() - start logger.info("upsert dataset[%s] - END-BEFORE-METAS - time[%.3f seconds]" % (dataset_code, end)) self.update_metas(dataset_code) end = time.time() - start logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end)) else: logger.info( "upsert dataset[%s] bypass because is updated from release_date[%s]" % (dataset_code, fetcher_data.release_date) ) def load_datasets_first(self): start = time.time() logger.info("first load fetcher[%s] - START" % (self.provider_name)) for dataset_code in DATASETS.keys(): self.upsert_dataset(dataset_code) end = time.time() - start logger.info("first load fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end)) def load_datasets_update(self): self.load_datasets_first() def build_data_tree(self, force_update=False): if self.provider.count_data_tree() > 1 and not force_update: return self.provider.data_tree for category_code, dataset in DATASETS.items(): category_key = self.provider.add_category( {"name": dataset["name"], "category_code": category_code, "doc_href": dataset["doc_href"]} ) _dataset = {"name": dataset["name"], "dataset_code": category_code} self.provider.add_dataset(_dataset, category_key) return self.provider.data_tree def parse_agenda(self): agenda = etree.HTML(get_agenda()) table = agenda.find(".//table") # only one table rows = table[0].findall("tr") # skipping first row cells = rows[1].findall("td") agenda = [] months = [None, None] for c in rows[1].iterfind("td"): content = c.find("strong") if content.text is None: content = content.find("strong") months.append(datetime.datetime.strptime(content.text, "%B %Y")) agenda.append(months) ir = 2 def get_links_text(cell): txt = [] for link in cell.findall("a"): if link.text: txt.append(link.text) return txt def _get_dates(cells): item = [] for ic, c in enumerate(cells): if c.text[0] != chr(160): item.append(re.match("\d\d|\d", c.text).group(0)) else: item.append(None) return item while ir < len(rows): cells = rows[ir].findall("td") content = cells[0] if content.text is None: content = content.find("a") item = [content.text] if cells[0].get("rowspan") == "2": two_rows = True content = cells[1].find("a") item.append(content.text) offset = 2 else: two_rows = False item.append(None) offset = 1 item.extend(_get_dates(cells[offset:])) agenda.append(item) ir += 1 if two_rows: cells = rows[ir].findall("td") links = get_links_text(cells[0]) for content in links: item = [item[0]] item.append(content) item.extend(_get_dates(cells[1:])) 
agenda.append(item) ir += 1 return agenda def get_calendar(self): agenda = self.parse_agenda() dataset_codes = [d["dataset_code"] for d in self.datasets_list()] """First line - exclude first 2 columns (title1, title2)""" months = agenda[0][2:] """All line moins first list""" periods = agenda[1:] def _get_dataset_code(title): for key, d in DATASETS.items(): if title in d.get("agenda_titles", []): return key return None for period in periods: title = period[0] if period[1]: title = "%s %s" % (title, period[1]) dataset_code = _get_dataset_code(title) if not dataset_code: logger.info("exclude calendar action for not implemented dataset[%s]" % title) continue if not dataset_code in dataset_codes: logger.info("exclude calendar action for dataset[%s]" % title) continue days = period[2:] scheds = [d for d in zip(months, days) if not d[1] is None] for date_base, day in scheds: yield { "action": "update_node", "kwargs": {"provider_name": self.provider_name, "dataset_code": dataset_code}, "period_type": "date", "period_kwargs": { "run_date": datetime.datetime(date_base.year, date_base.month, int(day), 8, 0, 0), "timezone": pytz.country_timezones(AGENDA["country"]), }, }
class Eurostat(Fetcher): """Class for managing the SDMX endpoint from eurostat in dlstats.""" def __init__(self, **kwargs): super().__init__(provider_name="EUROSTAT", version=VERSION, **kwargs) self.provider = Providers( name=self.provider_name, long_name="Eurostat", version=VERSION, region="Europe", website="http://ec.europa.eu/eurostat", terms_of_use="http://ec.europa.eu/eurostat/about/our-partners/copyright", fetcher=self, ) self.categories_filter = [ "nama_10", "namq_10", "nasa_10", "nasq_10", "naid_10", "nama", "namq", "nasa", "nasq", "gov", "ert", "irt", "prc", "bop", "bop_6", "demo_pjanbroad", "lfsi_act_q", "euroind", "pop", "labour", ] self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml" self.updated_catalog = False def _is_updated_catalog(self, creation_date): if not self.provider.from_db: self.provider_verify() if not self.provider.metadata: self.provider.metadata = {} if not "creation_date" in self.provider.metadata: self.provider.metadata["creation_date"] = creation_date self.provider.update_database() return True if creation_date > self.provider.metadata["creation_date"]: self.provider.metadata["creation_date"] = creation_date self.provider.update_database() return True return False def build_data_tree(self): """Builds the data tree """ download = Downloader( url=self.url_table_of_contents, filename="table_of_contents.xml", store_filepath=self.store_path, use_existing_file=self.use_existing_file, ) filepath = download.get_filepath() categories = [] categories_keys = [] it = etree.iterparse(filepath, events=["end"], tag="{urn:eu.europa.ec.eurostat.navtree}leaf") def is_selected(parent_codes): """parent_codes is array of category_code """ for _select in self.categories_filter: if _select in parent_codes: return True return False def get_category(category_code): for c in categories: if c["category_code"] == category_code: return c def create_categories(parent_codes, parent_titles, position): position += 1 for i in range(len(parent_codes)): category_code = parent_codes.pop() name = parent_titles.pop() all_parents = parent_codes.copy() parent = None if all_parents: parent = all_parents[-1] if not category_code in categories_keys: _category = { "provider_name": self.provider_name, "category_code": category_code, "name": name, "position": position + i, "parent": parent, "all_parents": all_parents, "datasets": [], "doc_href": None, "metadata": None, } categories_keys.append(category_code) categories.append(_category) position = 0 is_verify_creation_date = False for event, dataset in it: if is_verify_creation_date is False: _root = dataset.getroottree().getroot() creation_date_str = _root.attrib.get("creationDate") creation_date = clean_datetime(datetime.strptime(creation_date_str, "%Y%m%dT%H%M")) if self._is_updated_catalog(creation_date) is False: msg = "no update from eurostat catalog. 
current[%s] - db[%s]" logger.warning(msg % (creation_date, self.provider.metadata["creation_date"])) if not self.force_update: return [] is_verify_creation_date = True if not self.force_update: self.updated_catalog = True parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()", namespaces=TABLE_OF_CONTENT_NSMAP) if not is_selected(parent_codes): continue parent_titles = dataset.xpath( "ancestor::nt:branch/nt:title[attribute::language='en']/text()", namespaces=TABLE_OF_CONTENT_NSMAP ) category_code = parent_codes[-1] create_categories(parent_codes, parent_titles, position) category = get_category(category_code) name = xpath_title(dataset)[0] last_update = xpath_ds_last_update(dataset) last_modified = xpath_ds_last_modified(dataset) doc_href = xpath_ds_metadata_html(dataset) data_start = xpath_ds_data_start(dataset) data_end = xpath_ds_data_end(dataset) values = xpath_ds_values(dataset) last_update = datetime.strptime(last_update[0], "%d.%m.%Y") if last_modified: last_modified = datetime.strptime(last_modified[0], "%d.%m.%Y") last_update = max(last_update, last_modified) dataset_code = xpath_code(dataset)[0] _dataset = { "dataset_code": dataset_code, "name": name, "last_update": clean_datetime(last_update), "metadata": { "doc_href": first_element_xpath(doc_href), "data_start": first_element_xpath(data_start), "data_end": first_element_xpath(data_end), "values": int(first_element_xpath(values, default="0")), }, } category["datasets"].append(_dataset) self.for_delete.append(filepath) return categories def upsert_dataset(self, dataset_code): """Updates data in Database for selected datasets """ self.get_selected_datasets() doc = self.db[constants.COL_DATASETS].find_one( {"provider_name": self.provider_name, "dataset_code": dataset_code}, {"dataset_code": 1, "last_update": 1} ) dataset_settings = self.selected_datasets[dataset_code] if doc and doc["last_update"] >= dataset_settings["last_update"]: comments = "update-date[%s]" % doc["last_update"] raise errors.RejectUpdatedDataset( provider_name=self.provider_name, dataset_code=dataset_code, comments=comments ) dataset = Datasets( provider_name=self.provider_name, dataset_code=dataset_code, name=dataset_settings["name"], doc_href=dataset_settings["metadata"].get("doc_href"), last_update=None, fetcher=self, ) dataset.last_update = dataset_settings["last_update"] dataset.series.data_iterator = EurostatData(dataset) return dataset.update_database() def get_calendar(self): yield { "action": "update-fetcher", "period_type": "cron", "kwargs": {"provider_name": self.provider_name}, "period_kwargs": {"day": "*", "hour": 11, "minute": 1, "timezone": "Europe/Paris"}, } yield { "action": "update-fetcher", "period_type": "cron", "kwargs": {"provider_name": self.provider_name}, "period_kwargs": {"day": "*", "hour": 23, "minute": 1, "timezone": "Europe/Paris"}, } def load_datasets_update(self): datasets_list = self.datasets_list() if not self.updated_catalog and not self.force_update: msg = "update aborted for updated catalog" logger.warning(msg) dataset_codes = [d["dataset_code"] for d in datasets_list] # TODO: enable ? 
cursor = self.db[constants.COL_DATASETS].find( {"provider_name": self.provider_name, "dataset_code": {"$in": dataset_codes}}, {"dataset_code": 1, "last_update": 1}, ) selected_datasets = {s["dataset_code"]: s for s in cursor} for dataset in datasets_list: dataset_code = dataset["dataset_code"] last_update_from_catalog = dataset["last_update"] last_update_from_dataset = selected_datasets.get(dataset_code, {}).get("last_update") if (dataset_code not in selected_datasets) or (last_update_from_catalog > last_update_from_dataset): try: self.wrap_upsert_dataset(dataset_code) except Exception as err: if isinstance(err, errors.MaxErrors): raise msg = "error for provider[%s] - dataset[%s]: %s" logger.critical(msg % (self.provider_name, dataset_code, str(err))) else: msg = "bypass update - provider[%s] - dataset[%s] - last-update-dataset[%s] - last-update-catalog[%s]" logger.info( msg % (self.provider_name, dataset_code, last_update_from_dataset, last_update_from_catalog) )
class FED(Fetcher):

    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='FED', db=db, **kwargs)

        self.provider = Providers(name=self.provider_name, long_name='Federal Reserve',
                                  version=VERSION, region='US',
                                  website='http://www.federalreserve.gov', fetcher=self)

    def build_data_tree(self, force_update=False):
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category({"name": dataset["name"],
                                                       "category_code": category_code,
                                                       "doc_href": dataset["doc_href"]})
            _dataset = {"name": dataset["name"], "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)

        return self.provider.data_tree

    def upsert_dataset(self, dataset_code):
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        # TODO: check whether the dataset already exists or needs an update
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           last_update=datetime.now(),
                           fetcher=self)

        _data = FED_Data(dataset=dataset, url=DATASETS[dataset_code]['url'])
        dataset.series.data_iterator = _data
        result = dataset.update_database()
        _data = None

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

        return result

    def load_datasets_first(self):
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))

        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        # TODO:
        self.load_datasets_first()
class DESTATIS(Fetcher):

    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='DESTATIS', db=db, **kwargs)

        if not self.provider:
            self.provider = Providers(name=self.provider_name, long_name='Statistisches Bundesamt',
                                      version=VERSION, region='Germany',
                                      website='https://www.destatis.de', fetcher=self)

        if self.provider.version != VERSION:
            self.provider.update_database()

    def build_data_tree(self, force_update=False):
        return []
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category({"name": dataset["name"],
                                                       "category_code": category_code,
                                                       "doc_href": dataset["doc_href"]})
            _dataset = {"name": dataset["name"], "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)

        return self.provider.data_tree
        """

    def upsert_dataset(self, dataset_code):
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        # TODO: check whether the dataset already exists or needs an update
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           last_update=datetime.now(),
                           fetcher=self)

        _data = DESTATIS_Data(dataset=dataset, ns_tag_data=DATASETS[dataset_code]["ns_tag_data"])
        dataset.series.data_iterator = _data
        result = dataset.update_database()
        _data = None

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

        return result

    def load_datasets_first(self):
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))

        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        # TODO:
        self.load_datasets_first()
class Esri(Fetcher):

    def __init__(self, db=None):
        super().__init__(provider_name='ESRI', db=db)

        self.provider = Providers(name=self.provider_name,
                                  long_name='Economic and Social Research Institute, Cabinet Office',
                                  version=VERSION, region='Japan',
                                  website='http://www.esri.cao.go.jp/index-e.html', fetcher=self)
        self.datasets_dict = {}
        self.selected_codes = ['GDP.Amount']

    def build_data_tree(self, force_update=False):
        """Build data_tree from ESRI site parsing"""

        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        def make_node(data, parent_key):
            _category = dict(name=data['name'], category_code=data['category_code'])
            _category_key = self.provider.add_category(_category, parent_code=parent_key)

            if 'children' in data:
                for c in data['children']:
                    make_node(c, _category_key)

            if 'datasets' in data:
                for d in data['datasets']:
                    self.provider.add_dataset(dict(dataset_code=d['dataset_code'],
                                                   name=d['name'],
                                                   last_update=d['release_date'],
                                                   metadata={'url': d['url'],
                                                             'doc_href': d['doc_href']}),
                                              _category_key)

        try:
            for data in parse_esri_site():
                make_node(data, self.provider_name)
        except Exception as err:
            logger.error(err)
            raise

    def get_selected_datasets(self):
        """Collects the dataset codes that are in data_tree below the ones
        indicated in "selected_codes" provided in configuration

        :returns: dict of dataset settings keyed by dataset_code"""
        category_filter = [".*%s.*" % d for d in self.selected_codes]
        category_filter = "|".join(category_filter)
        self.selected_datasets = {d['dataset_code']: d
                                  for d in self.datasets_list(category_filter=category_filter)}
        return self.selected_datasets

    # necessary for test mock
    def make_url(self):
        return self.dataset_settings['metadata']['url']

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets

        :dset: dataset_code
        :returns: None"""
        self.get_selected_datasets()

        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        self.dataset_settings = self.selected_datasets[dataset_code]
        url = self.make_url()

        dataset = Datasets(self.provider_name, dataset_code, fetcher=self)
        dataset.name = self.dataset_settings['name']
        dataset.doc_href = self.dataset_settings['metadata']['doc_href']
        dataset.last_update = self.dataset_settings['last_update']

        data_iterator = EsriData(dataset, url, filename=dataset_code)
        dataset.series.data_iterator = data_iterator
        dataset.update_database()

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    # TO BE FINISHED
    def parse_sna_agenda(self):
        # TODO: use Downloader
        download = Downloader(url="http://www.esri.cao.go.jp/en/sna/kouhyou/kouhyou_top.html",
                              filename="agenda_sna.html")
        with open(download.get_filepath(), 'rb') as fp:
            agenda = lxml.html.parse(fp)

    # TO BE FINISHED
    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self.parse_agenda():
            if entry['dataflow_key'] in datasets:
                yield {'action': 'update_node',
                       'kwargs': {'provider_name': self.provider_name,
                                  'dataset_code': entry['dataflow_key']},
                       'period_type': 'date',
                       'period_kwargs': {'run_date': datetime.strptime(entry['scheduled_date'],
                                                                       "%d/%m/%Y %H:%M CET"),
                                         'timezone': pytz.timezone('Asia/Tokyo')}}

    # TODO: load earlier versions to get revisions
    def load_datasets_first(self):
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))

        self.provider.update_database()
        self.build_data_tree()
        self.upsert_data_tree()

        datasets_list = [d for d in self.get_selected_datasets().keys()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        start = time.time()
        logger.info("datasets update. provider[%s] - START" % (self.provider_name))

        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets update. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
class IMF(Fetcher):

    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='IMF', db=db, **kwargs)

        self.provider = Providers(name=self.provider_name, long_name="International Monetary Fund",
                                  version=VERSION, region='world',
                                  website='http://www.imf.org/', fetcher=self)

    def upsert_all_datasets(self):
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))

        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code)

        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def upsert_dataset(self, dataset_code):
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        if dataset_code == 'WEO':
            for u in self.weo_urls:
                self.upsert_weo_issue(u, dataset_code)
        else:
            raise Exception("This dataset is unknown: " + dataset_code)

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def datasets_list(self):
        return DATASETS.keys()

    def datasets_long_list(self):
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    @property
    def weo_urls(self):
        webpage = requests.get('http://www.imf.org/external/ns/cs.aspx?id=28')

        # TODO: replace with BeautifulSoup?
        html = etree.HTML(webpage.text)
        hrefs = html.xpath("//div[@id = 'content-main']/h4/a['href']")
        links = [href.values() for href in hrefs]

        # The last links of the WEO webpage lead to data we don't want to pull.
        links = links[:-16]
        # These are other links we don't want.
        links.pop(-8)
        links.pop(-10)
        links = [link[0][:-10] + 'download.aspx' for link in links]

        output = []
        for link in links:
            webpage = requests.get(link)
            html = etree.HTML(webpage.text)
            final_link = html.xpath("//div[@id = 'content']//table//a['href']")
            final_link = final_link[0].values()
            output.append(link[:-13] + final_link[0])

        # we need to handle the issues in chronological order
        return sorted(output)

    def upsert_weo_issue(self, url, dataset_code):
        settings = DATASETS[dataset_code]

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=settings['name'],
                           doc_href=settings['doc_href'],
                           fetcher=self)

        weo_data = WeoData(dataset, url)
        dataset.last_update = weo_data.release_date
        dataset.attribute_list.update_entry('flags', 'e', 'Estimated')
        dataset.series.data_iterator = weo_data

        try:
            dataset.update_database()
            self.update_metas(dataset_code)
        except Exception as err:
            logger.error(str(err))

    def upsert_categories(self):
        data_tree = {'name': 'IMF',
                     'category_code': 'imf_root',
                     'children': [{'name': 'WEO',
                                   'category_code': 'WEO',
                                   'exposed': True,
                                   'children': []}]}
        self.provider.add_data_tree(data_tree)
class WorldBank(Fetcher):

    def __init__(self, db=None):
        super().__init__(provider_name='WorldBank', db=db)

        self.provider = Providers(name=self.provider_name, long_name='World Bank',
                                  version=VERSION, region='world',
                                  website='http://www.worldbank.org/', fetcher=self)

    def upsert_categories(self):
        data_tree = {'name': 'World Bank',
                     'category_code': 'worldbank_root',
                     'children': [{'name': 'GEM',
                                   'category_code': 'GEM',
                                   'exposed': True,
                                   'children': []}]}
        self.provider.add_data_tree(data_tree)

    def upsert_dataset(self, dataset_code):
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        # TODO: return the _id field of the corresponding dataset. Update the category accordingly.
        if dataset_code == 'GEM':
            self.upsert_gem(dataset_code)
        else:
            raise Exception("This dataset is unknown: " + dataset_code)

        self.update_metas(dataset_code)

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def upsert_gem(self, dataset_code):
        d = DATASETS[dataset_code]
        url = d['url']

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=d['name'],
                           doc_href=d['doc_href'],
                           fetcher=self)

        gem_data = GemData(dataset, url)
        dataset.last_update = gem_data.release_date
        dataset.series.data_iterator = gem_data
        dataset.update_database()

    def upsert_all_datasets(self):
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))

        self.upsert_dataset('GEM')

        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def datasets_list(self):
        return DATASETS.keys()

    def datasets_long_list(self):
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    def download(self, dataset_code=None, url=None):
        filepath_dir = os.path.abspath(os.path.join(tempfile.gettempdir(), self.provider_name))
        filepath = "%s.zip" % os.path.abspath(os.path.join(filepath_dir, dataset_code))

        if not os.path.exists(filepath_dir):
            os.makedirs(filepath_dir, exist_ok=True)

        if os.path.exists(filepath):
            os.remove(filepath)

        if logger.isEnabledFor(logging.INFO):
            logger.info("store file to [%s]" % filepath)

        start = time.time()
        try:
            response = requests.get(url,
                                    # TODO: timeout=self.timeout,
                                    stream=True,
                                    allow_redirects=True,
                                    verify=False)

            if not response.ok:
                msg = "download url[%s] - status_code[%s] - reason[%s]" % (url,
                                                                           response.status_code,
                                                                           response.reason)
                logger.error(msg)
                raise Exception(msg)

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content():
                    f.write(chunk)

            return response.headers['Last-Modified'], filepath

        except requests.exceptions.ConnectionError as err:
            raise Exception("Connection Error")
        except requests.exceptions.ConnectTimeout as err:
            raise Exception("Connect Timeout")
        except requests.exceptions.ReadTimeout as err:
            raise Exception("Read Timeout")
        except Exception as err:
            raise Exception("Not captured exception : %s" % str(err))

        end = time.time() - start
        logger.info("download file[%s] - END - time[%.3f seconds]" % (url, end))
class ECB(Fetcher): def __init__(self, db=None, sdmx=None, **kwargs): super().__init__(provider_name='ECB', db=db, **kwargs) if not self.provider: self.provider = Providers(name=self.provider_name, long_name='European Central Bank', version=VERSION, region='Europe', website='http://www.ecb.europa.eu', fetcher=self) self.provider.update_database() if self.provider.version != VERSION: self.provider.update_database() self.sdmx = sdmx or ECBRequest(agency=self.provider_name) self.sdmx.timeout = 90 self._dataflows = None self._categoryschemes = None self._categorisations = None def _load_structure(self, force=False): """Load structure and build data_tree """ if (self._dataflows and self._categoryschemes and self._categorisations) and not force: return '''Force URL for select only ECB agency''' categoryschemes_response = self.sdmx.get(resource_type='categoryscheme', url='http://sdw-wsrest.ecb.int/service/categoryscheme/%s?references=parentsandsiblings' % self.provider_name) self._categorisations = categoryschemes_response.msg.categorisations self._categoryschemes = categoryschemes_response.msg.categoryschemes self._dataflows = categoryschemes_response.msg.dataflows def build_data_tree(self, force_update=False): """Build data_tree from structure datas """ if self.provider.count_data_tree() > 1 and not force_update: return self.provider.data_tree self._load_structure() for category in self._categoryschemes.aslist(): _category = dict(name=category.name.en, category_code=category.id) category_key = self.provider.add_category(_category) for subcategory in category.values(): if not subcategory.id in self._categorisations: continue _subcategory = dict(name=subcategory.name.en, category_code=subcategory.id) _subcategory_key = self.provider.add_category(_subcategory, parent_code=category_key) try: _categorisation = self._categorisations[subcategory.id] for i in _categorisation: _d = self._dataflows[i.artefact.id] self.provider.add_dataset(dict(dataset_code=_d.id, name=_d.name.en), _subcategory_key) except Exception as err: logger.error(err) raise return self.provider.data_tree def parse_agenda(self): #TODO: use Downloader download = Downloader(url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html", filename="statscall.html") with open(download.get_filepath(), 'rb') as fp: agenda = lxml.html.parse(fp) regex_date = re.compile("Reference period: (.*)") regex_dataset = re.compile(".*Dataset: (.*)\)") entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | ' '//div[@class="ecb-faytdd"]/*/dd')[2:] entries = zip(entries[::2], entries[1::2]) for entry in entries: item = {} match_key = regex_dataset.match(entry[1][0].text_content()) item['dataflow_key'] = match_key.groups()[0] match_date = regex_date.match(entry[1][1].text_content()) item['reference_period'] = match_date.groups()[0] item['scheduled_date'] = entry[0].text_content().replace('\n','') yield(item) def get_calendar(self): datasets = [d["dataset_code"] for d in self.datasets_list()] for entry in self.parse_agenda(): if entry['dataflow_key'] in datasets: yield {'action': 'update_node', 'kwargs': {'provider_name': self.provider_name, 'dataset_code': entry['dataflow_key']}, 'period_type': 'date', 'period_kwargs': {'run_date': datetime.strptime( entry['scheduled_date'], "%d/%m/%Y %H:%M CET"), 'timezone': pytz.timezone('CET') } } def upsert_dataset(self, dataset_code): start = time.time() logger.info("upsert dataset[%s] - START" % (dataset_code)) #TODO: control si existe ou update !!! 
dataset = Datasets(provider_name=self.provider_name, dataset_code=dataset_code, name=None, doc_href=self.provider.website, last_update=datetime.now(), fetcher=self) _data = ECB_Data(dataset=dataset) dataset.series.data_iterator = _data try: result = dataset.update_database() except: raise _data = None end = time.time() - start logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end)) return result def load_datasets_first(self): start = time.time() logger.info("datasets first load. provider[%s] - START" % (self.provider_name)) self._load_structure() self.provider.update_database() self.upsert_data_tree() datasets_list = [d["dataset_code"] for d in self.datasets_list()] for dataset_code in datasets_list: try: self.upsert_dataset(dataset_code) except Exception as err: logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err))) end = time.time() - start logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end)) def load_datasets_update(self): #TODO: self.load_datasets_first()
class Eurostat(Fetcher): """Class for managing the SDMX endpoint from eurostat in dlstats.""" def __init__(self, db=None): super().__init__(provider_name='Eurostat', db=db) if not self.provider: self.provider = Providers(name=self.provider_name, long_name='Eurostat', version=VERSION, region='Europe', website='http://ec.europa.eu/eurostat', fetcher=self) self.provider.update_database() if self.provider.version != VERSION: self.provider.update_database() self.selected_codes = [ 'nama_10', 'namq_10', 'nasa_10', 'nasq_10', 'naid_10', 'nama', 'namq', 'nasa', 'nasq', 'gov', 'ert', 'irt', 'prc', 'bop', 'bop_6', 'demo_pjanbroad', 'lfsi_act_q' ] self.selected_datasets = {} self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml" self.dataset_url = None def build_data_tree(self, force_update=False): """Builds the data tree Pour créer les categories, ne prend que les branch dont l'un des <code> de la branch se trouvent dans selected_codes Même chose pour les datasets. Prend le category_code du parent et verifie si il est dans selected_codes """ start = time.time() logger.info("build_data_tree provider[%s] - START" % self.provider_name) if self.provider.count_data_tree() > 1 and not force_update: logger.info("use existing data-tree for provider[%s]" % self.provider_name) return self.provider.data_tree filepath = self.get_table_of_contents() it = etree.iterparse(filepath, events=['end']) def is_selected(parent_codes): """parent_codes is array of category_code """ for _select in self.selected_codes: if _select in parent_codes: return True return False for event, element in it: if event == 'end': if element.tag == fixtag_toc('nt', 'branch'): for child in element.iterchildren(tag=fixtag_toc('nt', 'children')): _parent_codes = xpath_parent_codes(child) _parents = xpath_ancestor_branch(child) if not is_selected(_parent_codes): continue for parent in _parents: _parent_code = xpath_code(parent)[0] _parent_title =xpath_title(parent)[0] '''Extrait la partie gauche des categories parents''' _parent_categories = ".".join(_parent_codes[:_parent_codes.index(_parent_code)]) _category = None _parent = None if not _parent_categories or len(_parent_categories) == 0: _category = {"category_code": _parent_code, "name": _parent_title} else: _parent = self.provider._category_key(_parent_categories) _category = {"category_code": _parent_code, "name": _parent_title} try: _key = self.provider.add_category(_category, _parent) except: #Pas de capture car verifie seulement si existe pass datasets = xpath_datasets(child) for dataset in datasets: parent_codes = xpath_parent_codes(dataset) dataset_code = xpath_code(dataset)[0] category_code = self.provider._category_key(".".join(parent_codes)) '''Verifie si au moins un des category_code est dans selected_codes''' if not is_selected(parent_codes): continue name = xpath_title(dataset)[0] last_update = xpath_ds_last_update(dataset) last_modified = xpath_ds_last_modified(dataset) doc_href = xpath_ds_metadata_html(dataset) data_start = xpath_ds_data_start(dataset) data_end = xpath_ds_data_end(dataset) values = xpath_ds_values(dataset) last_update = datetime.strptime(last_update[0], '%d.%m.%Y') if last_modified: last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y') last_update = max(last_update, last_modified) dataset = { "dataset_code": dataset_code, "name": name, "last_update": last_update, "metadata": { "doc_href": first_element_xpath(doc_href), "data_start": first_element_xpath(data_start), 
"data_end": first_element_xpath(data_end), "values": int(first_element_xpath(values, default="0")), } } self.provider.add_dataset(dataset, category_code) dataset.clear() child.clear() element.clear() end = time.time() - start logger.info("build_data_tree load provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end)) return self.provider.data_tree def get_table_of_contents(self): return Downloader(url=self.url_table_of_contents, filename="table_of_contents.xml").get_filepath() def get_selected_datasets(self): """Collects the dataset codes that are in table of contents below the ones indicated in "selected_codes" provided in configuration :returns: list of dict of dataset settings""" category_filter = [".*%s.*" % d for d in self.selected_codes] category_filter = "|".join(category_filter) self.selected_datasets = {d['dataset_code']: d for d in self.datasets_list(category_filter=category_filter)} return self.selected_datasets def upsert_dataset(self, dataset_code): """Updates data in Database for selected datasets :dset: dataset_code :returns: None""" self.get_selected_datasets() start = time.time() logger.info("upsert dataset[%s] - START" % (dataset_code)) dataset_settings = self.selected_datasets[dataset_code] dataset = Datasets(provider_name=self.provider_name, dataset_code=dataset_code, name=dataset_settings["name"], doc_href=dataset_settings["metadata"].get("doc_href"), last_update=dataset_settings["last_update"], fetcher=self) data_iterator = EurostatData(dataset, filename=dataset_code) dataset.series.data_iterator = data_iterator dataset.update_database() end = time.time() - start logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end)) def load_datasets_first(self): self.get_selected_datasets() start = time.time() logger.info("first load provider[%s] - START" % (self.provider_name)) for dataset_code in self.selected_datasets.keys(): try: self.upsert_dataset(dataset_code) except Exception as err: logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err))) end = time.time() - start logger.info("first load provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end)) def load_datasets_update(self): self.get_selected_datasets() start = time.time() logger.info("update provider[%s] - START" % (self.provider_name)) selected_datasets = self.db[constants.COL_DATASETS].find( {'provider_name': self.provider_name, 'dataset_code': {'$in': list(self.selected_datasets.keys())}}, {'dataset_code': 1, 'last_update': 1}) selected_datasets = {s['dataset_code'] : s for s in selected_datasets} for dataset_code, dataset in self.selected_datasets.items(): if (dataset_code not in selected_datasets) or (selected_datasets[dataset_code]['last_update'] < dataset['last_update']): try: self.upsert_dataset(dataset_code) except Exception as err: logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err))) end = time.time() - start logger.info("update provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
class INSEE(Fetcher):

    def __init__(self, db=None, sdmx=None, **kwargs):
        super().__init__(provider_name='INSEE', db=db, **kwargs)

        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='National Institute of Statistics and Economic Studies',
                                      version=VERSION, region='France',
                                      website='http://www.insee.fr', fetcher=self)
            self.provider.update_database()

        if self.provider.version != VERSION:
            self.provider.update_database()

        self.sdmx = sdmx or Request(agency='INSEE')

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None

    def _load_structure(self, force=False):
        if self._dataflows and not force:
            return

        """
        # http://www.bdm.insee.fr/series/sdmx/categoryscheme
        categoryscheme_response = self.sdmx.get(resource_type='categoryscheme', params={"references": None})
        logger.debug(categoryscheme_response.url)
        self._categoryschemes = categoryscheme_response.msg.categoryschemes

        # http://www.bdm.insee.fr/series/sdmx/categorisation
        categorisation_response = self.sdmx.get(resource_type='categorisation')
        logger.debug(categorisation_response.url)
        self._categorisations = categorisation_response.msg.categorisations
        """

        # http://www.bdm.insee.fr/series/sdmx/dataflow
        dataflows_response = self.sdmx.get(resource_type='dataflow')
        logger.debug(dataflows_response.url)
        self._dataflows = dataflows_response.msg.dataflows

    def load_datasets_first(self):
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))

        for dataset_code in self.datasets_list():
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        # TODO:
        self.load_datasets_first()

    def build_data_tree(self, force_update=False):
        """Build data_tree from structure data"""

        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        self._load_structure()

        for dataset_code, dataset in self._dataflows.items():
            name = dataset.name
            if "en" in dataset.name:
                name = dataset.name.en
            else:
                name = dataset.name.fr
            self.provider.add_dataset(dict(dataset_code=dataset_code, name=name),
                                      self.provider_name)

        return self.provider.data_tree

        # NOTE: the code below is unreachable (dead code after the return above)
        for category in self._categoryschemes.aslist():
            _category = dict(name=category.name.en, category_code=category.id)
            category_key = self.provider.add_category(_category)

            for subcategory in category.values():
                if not subcategory.id in self._categorisations:
                    continue

                _subcategory = dict(name=subcategory.name.en, category_code=subcategory.id)
                _subcategory_key = self.provider.add_category(_subcategory, parent_code=category_key)
                try:
                    _categorisation = self._categorisations[subcategory.id]
                    for i in _categorisation:
                        _d = self._dataflows[i.artefact.id]
                        self.provider.add_dataset(dict(dataset_code=_d.id, name=_d.name.en),
                                                  _subcategory_key)
                except Exception as err:
                    logger.error(err)
                    raise

        return self.provider.data_tree

    def upsert_dataset(self, dataset_code):
        # self.load_structure(force=False)

        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        # if not dataset_code in self._dataflows:
        #     raise Exception("This dataset is unknown: %s" % dataset_code)

        # dataflow = self._dataflows[dataset_code]

        # cat = self.db[constants.COL_CATEGORIES].find_one({'category_code': dataset_code})
        # dataset.name = cat['name']
        # dataset.doc_href = cat['doc_href']
        # dataset.last_update = cat['last_update']

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           # name=dataflow.name.en,
                           doc_href=None,
                           last_update=datetime.now(),  # TODO:
                           fetcher=self)

        dataset_doc = self.db[constants.COL_DATASETS].find_one({'provider_name': self.provider_name,
                                                                "dataset_code": dataset_code})

        insee_data = INSEE_Data(dataset=dataset,
                                dataset_doc=dataset_doc,
                                # dataflow=dataflow,
                                # sdmx=self.sdmx
                                )
        dataset.series.data_iterator = insee_data
        result = dataset.update_database()

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

        """
        > IDBANK: define dynamically from the site?
        doc_href of a series: http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=001694226

        > GROUP CODE: Balance des Paiements mensuelle - Compte de capital
        http://www.bdm.insee.fr/bdm2/choixCriteres?codeGroupe=1556
        """

        return result
class OECD(Fetcher):

    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='OECD', db=db, **kwargs)

        self.provider_name = 'OECD'
        self.provider = Providers(name=self.provider_name,
                                  long_name='Organisation for Economic Co-operation and Development',
                                  version=VERSION,
                                  region='world',
                                  website='http://www.oecd.org',
                                  fetcher=self)

    def upsert_dataset(self, dataset_code, datas=None):
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        if not DATASETS.get(dataset_code):
            raise Exception("This dataset is unknown: " + dataset_code)

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           fetcher=self)

        fetcher_data = OECD_Data(dataset)
        dataset.series.data_iterator = fetcher_data
        dataset.update_database()

        end = time.time() - start
        logger.info("upsert dataset[%s] - END-BEFORE-METAS - time[%.3f seconds]" % (dataset_code, end))

        self.update_metas(dataset_code)

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def datasets_list(self):
        return DATASETS.keys()

    def datasets_long_list(self):
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    def upsert_all_datasets(self):
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))

        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code)

        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def upsert_categories(self):
        data_tree = {'name': 'OECD',
                     'category_code': 'oecd_root',
                     'children': []}

        for dataset_code in DATASETS.keys():
            data_tree['children'].append({'name': DATASETS[dataset_code]['name'],
                                          'category_code': dataset_code,
                                          'exposed': True,
                                          'children': None})

        self.provider.add_data_tree(data_tree)
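# Usage sketch (illustrative only): driving the OECD fetcher above. It assumes
# the module-level DATASETS mapping (dataset_code -> {'name': ..., 'doc_href': ...})
# and a pymongo database handle; "dlstats_test" is a hypothetical database name.
if __name__ == "__main__":
    from pymongo import MongoClient

    db = MongoClient()["dlstats_test"]
    oecd = OECD(db=db)

    # List the hard-coded catalog, then load every dataset it declares.
    for code, name in oecd.datasets_long_list():
        print(code, "->", name)
    oecd.upsert_all_datasets()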
class Eurostat(Fetcher):
    """Class for managing the SDMX endpoint from eurostat in dlstats."""

    def __init__(self, **kwargs):
        super().__init__(provider_name='EUROSTAT', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='Eurostat',
                                  version=VERSION,
                                  region='Europe',
                                  website='http://ec.europa.eu/eurostat',
                                  terms_of_use='http://ec.europa.eu/eurostat/about/our-partners/copyright',
                                  fetcher=self)

        self.categories_filter = [
            'nama10',
            'namq_10',
            'nasa_10',
            'nasq_10',
            'naid_10',
            'nama',
            'namq',
            'nasa',
            'nasq',
            'gov',
            'ert',
            'irt',
            'prc',
            'bop',
            'bop_6',
            'demo',        # We harvest demo because we need demo_pjanbroad.
            'lfsi_act_q',
            'euroind',
            'pop',
            'labour',
        ]

        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.updated_catalog = False

    def _is_updated_catalog(self, creation_date):
        if not self.provider.from_db:
            self.provider_verify()

        if not self.provider.metadata:
            self.provider.metadata = {}

        if "creation_date" not in self.provider.metadata:
            self.provider.metadata["creation_date"] = creation_date
            self.provider.update_database()
            return True

        if creation_date > self.provider.metadata["creation_date"]:
            self.provider.metadata["creation_date"] = creation_date
            self.provider.update_database()
            return True

        return False

    def build_data_tree(self):
        """Builds the data tree"""

        download = Downloader(url=self.url_table_of_contents,
                              filename="table_of_contents.xml",
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()

        categories = []
        categories_keys = []

        it = etree.iterparse(filepath,
                             events=['end'],
                             tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

        def is_selected(parent_codes):
            """parent_codes is array of category_code"""
            for _select in self.categories_filter:
                if _select in parent_codes:
                    return True
            return False

        def get_category(category_code):
            for c in categories:
                if c["category_code"] == category_code:
                    return c

        def create_categories(parent_codes, parent_titles, position):
            position += 1
            for i in range(len(parent_codes)):
                category_code = parent_codes.pop()
                name = parent_titles.pop()
                all_parents = parent_codes.copy()
                parent = None
                if all_parents:
                    parent = all_parents[-1]
                if category_code not in categories_keys:
                    _category = {
                        "provider_name": self.provider_name,
                        "category_code": category_code,
                        "name": name,
                        "position": position + i,
                        "parent": parent,
                        'all_parents': all_parents,
                        "datasets": [],
                        "doc_href": None,
                        "metadata": None
                    }
                    categories_keys.append(category_code)
                    categories.append(_category)

        position = 0
        is_verify_creation_date = False

        for event, dataset in it:

            if is_verify_creation_date is False:
                _root = dataset.getroottree().getroot()
                creation_date_str = _root.attrib.get("creationDate")
                creation_date = clean_datetime(datetime.strptime(creation_date_str, '%Y%m%dT%H%M'))

                if self._is_updated_catalog(creation_date) is False:
                    msg = "no update from eurostat catalog. current[%s] - db[%s]"
                    logger.warning(msg % (creation_date,
                                          self.provider.metadata["creation_date"]))
                    if not self.force_update:
                        return []

                is_verify_creation_date = True

                if not self.force_update:
                    self.updated_catalog = True

            parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                         namespaces=TABLE_OF_CONTENT_NSMAP)

            if not is_selected(parent_codes):
                continue

            parent_titles = dataset.xpath("ancestor::nt:branch/nt:title[attribute::language='en']/text()",
                                          namespaces=TABLE_OF_CONTENT_NSMAP)

            category_code = parent_codes[-1]
            create_categories(parent_codes, parent_titles, position)

            category = get_category(category_code)

            name = xpath_title(dataset)[0]
            last_update = xpath_ds_last_update(dataset)
            last_modified = xpath_ds_last_modified(dataset)
            doc_href = xpath_ds_metadata_html(dataset)
            data_start = xpath_ds_data_start(dataset)
            data_end = xpath_ds_data_end(dataset)
            values = xpath_ds_values(dataset)

            last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
            if last_modified:
                last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
                last_update = max(last_update, last_modified)

            dataset_code = xpath_code(dataset)[0]

            _dataset = {
                "dataset_code": dataset_code,
                "name": name,
                "last_update": clean_datetime(last_update),
                "metadata": {
                    "doc_href": first_element_xpath(doc_href),
                    "data_start": first_element_xpath(data_start),
                    "data_end": first_element_xpath(data_end),
                    "values": int(first_element_xpath(values, default="0")),
                }
            }
            category["datasets"].append(_dataset)

        self.for_delete.append(filepath)

        return categories

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets"""

        self.get_selected_datasets()

        doc = self.db[constants.COL_DATASETS].find_one(
            {'provider_name': self.provider_name,
             'dataset_code': dataset_code},
            {'dataset_code': 1, 'last_update': 1})

        dataset_settings = self.selected_datasets[dataset_code]

        if doc and doc['last_update'] >= dataset_settings['last_update']:
            comments = "update-date[%s]" % doc['last_update']
            raise errors.RejectUpdatedDataset(provider_name=self.provider_name,
                                              dataset_code=dataset_code,
                                              comments=comments)

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=dataset_settings["name"],
                           doc_href=dataset_settings["metadata"].get("doc_href"),
                           last_update=None,
                           fetcher=self)
        dataset.last_update = dataset_settings["last_update"]

        dataset.series.data_iterator = EurostatData(dataset)

        return dataset.update_database()

    def get_calendar(self):
        yield {
            "action": "update-fetcher",
            "period_type": "cron",
            "kwargs": {"provider_name": self.provider_name},
            "period_kwargs": {
                "day": '*',
                "hour": 11,
                "minute": 1,
                "timezone": 'Europe/Paris'
            }
        }
        yield {
            "action": "update-fetcher",
            "period_type": "cron",
            "kwargs": {"provider_name": self.provider_name},
            "period_kwargs": {
                "day": '*',
                "hour": 23,
                "minute": 1,
                "timezone": 'Europe/Paris'
            }
        }

    def load_datasets_update(self):
        datasets_list = self.datasets_list()

        if not self.updated_catalog and not self.force_update:
            msg = "update aborted for updated catalog"
            logger.warning(msg)

        dataset_codes = [d["dataset_code"] for d in datasets_list]

        #TODO: enable ?
        cursor = self.db[constants.COL_DATASETS].find(
            {'provider_name': self.provider_name,
             'dataset_code': {'$in': dataset_codes}},
            {'dataset_code': 1, 'last_update': 1})

        selected_datasets = {s['dataset_code']: s for s in cursor}

        for dataset in datasets_list:
            dataset_code = dataset["dataset_code"]
            last_update_from_catalog = dataset['last_update']
            last_update_from_dataset = selected_datasets.get(dataset_code, {}).get('last_update')

            if (dataset_code not in selected_datasets) or (last_update_from_catalog > last_update_from_dataset):
                try:
                    self.wrap_upsert_dataset(dataset_code)
                except Exception as err:
                    if isinstance(err, errors.MaxErrors):
                        raise
                    msg = "error for provider[%s] - dataset[%s]: %s"
                    logger.critical(msg % (self.provider_name, dataset_code, str(err)))
            else:
                msg = "bypass update - provider[%s] - dataset[%s] - last-update-dataset[%s] - last-update-catalog[%s]"
                logger.info(msg % (self.provider_name, dataset_code,
                                   last_update_from_dataset, last_update_from_catalog))
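# Usage sketch (illustrative only): the incremental-update flow of the Eurostat
# fetcher above. It assumes a pymongo database handle and that the base Fetcher
# class supplies datasets_list() and wrap_upsert_dataset(), which
# load_datasets_update() relies on; "dlstats_test" is a hypothetical database name.
if __name__ == "__main__":
    from pymongo import MongoClient

    db = MongoClient()["dlstats_test"]
    eurostat = Eurostat(db=db)

    # build_data_tree() downloads table_of_contents.xml, checks its creationDate
    # against the stored provider metadata and returns the filtered category tree.
    categories = eurostat.build_data_tree()
    print("categories harvested:", len(categories))

    # load_datasets_update() then upserts only the datasets whose catalog
    # last_update is newer than what is already stored in MongoDB.
    eurostat.load_datasets_update()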