def test_add_data_tree(self):
    # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_add_data_tree
    fetcher = Fetcher(provider_name="p1", is_indexes=False)
    provider = Providers(name="p1",
                         long_name="Provider One",
                         version=1,
                         region="Dreamland",
                         website="http://www.example.com",
                         fetcher=fetcher)

    # A fresh provider starts with a single root node in its data tree.
    self.assertEqual(len(provider.data_tree), 1)

    root = provider.data_tree[0]
    root["category_code"] = provider.name
    root["long_name"] = provider.long_name
    root["website"] = provider.website
    provider.update_database()

    # Adding a minimal category must append a child node under the root.
    provider.add_category({'category_code': "c0", 'name': "p1"})

    expected_root = {'category_code': 'p1',
                     'datasets': [],
                     'description': None,
                     'doc_href': 'http://www.example.com',
                     'exposed': False,
                     'last_update': None,
                     'name': 'p1'}
    expected_child = {'category_code': 'p1.c0',
                      'datasets': [],
                      'description': None,
                      'doc_href': None,
                      'exposed': False,
                      'last_update': None,
                      'name': 'p1'}
    self.assertEqual(provider.data_tree, [expected_root, expected_child])
class FED(Fetcher):
    """Fetcher for the US Federal Reserve (FED) statistics provider."""

    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='FED', db=db, **kwargs)
        # Provider metadata; persisted later by update_database().
        self.provider = Providers(name=self.provider_name,
                                  long_name='Federal Reserve',
                                  version=VERSION,
                                  region='US',
                                  website='http://www.federalreserve.gov',
                                  fetcher=self)

    def build_data_tree(self, force_update=False):
        """Build the provider data tree from the static DATASETS catalog.

        Creates one category per dataset code and attaches the dataset to
        it. The existing tree is reused unless force_update is True.

        :returns: the provider's data_tree list
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree
        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category({"name": dataset["name"],
                                                       "category_code": category_code,
                                                       "doc_href": dataset["doc_href"]})
            _dataset = {"name": dataset["name"],
                        "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)
        return self.provider.data_tree

    def upsert_dataset(self, dataset_code):
        """Create or refresh one dataset (and its series) in the database.

        :param dataset_code: key into the DATASETS catalog
        :returns: result of Datasets.update_database()
        """
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        #TODO: check whether the dataset already exists, else update !!!
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           last_update=datetime.now(),
                           fetcher=self)
        _data = FED_Data(dataset=dataset,
                         url=DATASETS[dataset_code]['url'])
        dataset.series.data_iterator = _data
        result = dataset.update_database()
        # drop the iterator reference once the data is persisted
        _data = None
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        return result

    def load_datasets_first(self):
        """Initial load: persist provider and data tree, then upsert every dataset.

        Per-dataset failures are logged as fatal but do not abort the loop.
        """
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        self.provider.update_database()
        self.upsert_data_tree()
        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))
        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: incremental update — currently delegates to the full first load
        self.load_datasets_first()
class Esri(Fetcher):
    """Fetcher for Japan's Economic and Social Research Institute (ESRI)."""

    def __init__(self, db=None):
        super().__init__(provider_name='ESRI', db=db)
        self.provider = Providers(name=self.provider_name,
                                  long_name='Economic and Social Research Institute, Cabinet Office',
                                  version=VERSION,
                                  region='Japan',
                                  website='http://www.esri.cao.go.jp/index-e.html',
                                  fetcher=self)
        self.datasets_dict = {}
        # only data-tree branches matching these codes are loaded
        self.selected_codes = ['GDP.Amount']

    def build_data_tree(self, force_update=False):
        """Build data_tree from ESRI site parsing.

        Recursively registers categories/datasets returned by
        parse_esri_site(). Reuses the existing tree unless force_update.

        :returns: the provider's data_tree list
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        def make_node(data, parent_key):
            # register one category, then recurse into children/datasets
            _category = dict(name=data['name'],
                             category_code=data['category_code'])
            _category_key = self.provider.add_category(_category,
                                                       parent_code=parent_key)
            if 'children' in data:
                for c in data['children']:
                    make_node(c, _category_key)
            if 'datasets' in data:
                for d in data['datasets']:
                    self.provider.add_dataset(dict(dataset_code=d['dataset_code'],
                                                   name=d['name'],
                                                   last_update=d['release_date'],
                                                   metadata={'url': d['url'],
                                                             'doc_href': d['doc_href']}),
                                              _category_key)

        try:
            for data in parse_esri_site():
                make_node(data, self.provider_name)
        except Exception as err:
            logger.error(err)
            raise
        # FIX: return the tree on the build path too (previously returned None,
        # unlike the early-return branch and every sibling fetcher)
        return self.provider.data_tree

    def get_selected_datasets(self):
        """Collects the dataset codes that are in data_tree
        below the ones indicated in "selected_codes" provided in configuration
        :returns: dict of dataset settings keyed by dataset_code"""
        category_filter = "|".join(".*%s.*" % d for d in self.selected_codes)
        self.selected_datasets = {d['dataset_code']: d
                                  for d in self.datasets_list(category_filter=category_filter)}
        return self.selected_datasets

    # necessary for test mock
    def make_url(self):
        return self.dataset_settings['metadata']['url']

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets
        :dset: dataset_code
        :returns: None"""
        self.get_selected_datasets()
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        self.dataset_settings = self.selected_datasets[dataset_code]
        url = self.make_url()
        dataset = Datasets(self.provider_name, dataset_code, fetcher=self)
        dataset.name = self.dataset_settings['name']
        dataset.doc_href = self.dataset_settings['metadata']['doc_href']
        dataset.last_update = self.dataset_settings['last_update']
        data_iterator = EsriData(dataset, url, filename=dataset_code)
        dataset.series.data_iterator = data_iterator
        dataset.update_database()
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    # TO BE FINISHED
    def parse_sna_agenda(self):
        #TODO: use Downloader
        download = Downloader(url="http://www.esri.cao.go.jp/en/sna/kouhyou/kouhyou_top.html",
                              filename="agenda_sna.html")
        with open(download.get_filepath(), 'rb') as fp:
            agenda = lxml.html.parse(fp)

    # TO BE FINISHED
    def get_calendar(self):
        """Yield scheduler actions for datasets found in the release agenda.

        NOTE(review): self.parse_agenda is not defined on this class (only
        parse_sna_agenda, unfinished) — confirm where it is provided.
        """
        datasets = [d["dataset_code"] for d in self.datasets_list()]
        for entry in self.parse_agenda():
            if entry['dataflow_key'] in datasets:
                # NOTE(review): date format says CET but timezone is Asia/Tokyo
                # — looks copied from the ECB fetcher; confirm agenda format.
                yield {'action': 'update_node',
                       'kwargs': {'provider_name': self.provider_name,
                                  'dataset_code': entry['dataflow_key']},
                       'period_type': 'date',
                       'period_kwargs': {'run_date': datetime.strptime(
                           entry['scheduled_date'], "%d/%m/%Y %H:%M CET"),
                           'timezone': pytz.timezone('Asia/Tokyo')
                       }
                       }

    # TODO: load earlier versions to get revisions
    def load_datasets_first(self):
        """Initial load: persist provider and tree, then upsert the selected datasets."""
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        self.provider.update_database()
        self.build_data_tree()
        self.upsert_data_tree()
        datasets_list = list(self.get_selected_datasets())
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))
        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        """Update load over every known dataset.

        NOTE(review): unlike load_datasets_first this iterates all datasets,
        not only the selected ones — confirm this is intentional.
        """
        start = time.time()
        # FIX: log message said "datasets first load." (copy-paste)
        logger.info("datasets update. provider[%s] - START" % (self.provider_name))
        self.provider.update_database()
        self.upsert_data_tree()
        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))
        end = time.time() - start
        logger.info("datasets update. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
class ECB(Fetcher):
    """Fetcher for the European Central Bank SDMX web service."""

    def __init__(self, db=None, sdmx=None, **kwargs):
        super().__init__(provider_name='ECB', db=db, **kwargs)
        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='European Central Bank',
                                      version=VERSION,
                                      region='Europe',
                                      website='http://www.ecb.europa.eu',
                                      fetcher=self)
            self.provider.update_database()
        if self.provider.version != VERSION:
            # stored provider document predates this fetcher version
            self.provider.update_database()
        self.sdmx = sdmx or ECBRequest(agency=self.provider_name)
        self.sdmx.timeout = 90
        # SDMX structure caches, filled lazily by _load_structure()
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None

    def _load_structure(self, force=False):
        """Load SDMX structure (categoryschemes/categorisations/dataflows).

        No-op when all three caches are populated, unless force is True.
        """
        if (self._dataflows and self._categoryschemes and self._categorisations) and not force:
            return
        # Force URL to select only the ECB agency
        categoryschemes_response = self.sdmx.get(
            resource_type='categoryscheme',
            url='http://sdw-wsrest.ecb.int/service/categoryscheme/%s?references=parentsandsiblings' % self.provider_name)
        self._categorisations = categoryschemes_response.msg.categorisations
        self._categoryschemes = categoryschemes_response.msg.categoryschemes
        self._dataflows = categoryschemes_response.msg.dataflows

    def build_data_tree(self, force_update=False):
        """Build data_tree from the SDMX structure.

        Registers each category scheme, its categorised subcategories and
        the dataflows attached to them. Reuses the existing tree unless
        force_update.

        :returns: the provider's data_tree list
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree
        self._load_structure()
        for category in self._categoryschemes.aslist():
            _category = dict(name=category.name.en,
                             category_code=category.id)
            category_key = self.provider.add_category(_category)
            for subcategory in category.values():
                # only subcategories that actually categorise a dataflow
                if subcategory.id not in self._categorisations:
                    continue
                _subcategory = dict(name=subcategory.name.en,
                                    category_code=subcategory.id)
                _subcategory_key = self.provider.add_category(_subcategory,
                                                              parent_code=category_key)
                try:
                    _categorisation = self._categorisations[subcategory.id]
                    for i in _categorisation:
                        _d = self._dataflows[i.artefact.id]
                        self.provider.add_dataset(dict(dataset_code=_d.id,
                                                       name=_d.name.en),
                                                  _subcategory_key)
                except Exception as err:
                    logger.error(err)
                    raise
        return self.provider.data_tree

    def parse_agenda(self):
        """Yield release-calendar entries scraped from the ECB statistics calendar."""
        #TODO: use Downloader
        download = Downloader(url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                              filename="statscall.html")
        with open(download.get_filepath(), 'rb') as fp:
            agenda = lxml.html.parse(fp)
        # FIX: raw strings for regex patterns ("\)" is an invalid str escape)
        regex_date = re.compile(r"Reference period: (.*)")
        regex_dataset = re.compile(r".*Dataset: (.*)\)")
        entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                               '//div[@class="ecb-faytdd"]/*/dd')[2:]
        # dt/dd pairs: (date cell, description cell)
        entries = zip(entries[::2], entries[1::2])
        for entry in entries:
            item = {}
            match_key = regex_dataset.match(entry[1][0].text_content())
            item['dataflow_key'] = match_key.groups()[0]
            match_date = regex_date.match(entry[1][1].text_content())
            item['reference_period'] = match_date.groups()[0]
            item['scheduled_date'] = entry[0].text_content().replace('\n', '')
            yield(item)

    def get_calendar(self):
        """Yield scheduler actions for agenda entries matching known datasets."""
        datasets = [d["dataset_code"] for d in self.datasets_list()]
        for entry in self.parse_agenda():
            if entry['dataflow_key'] in datasets:
                yield {'action': 'update_node',
                       'kwargs': {'provider_name': self.provider_name,
                                  'dataset_code': entry['dataflow_key']},
                       'period_type': 'date',
                       'period_kwargs': {'run_date': datetime.strptime(
                           entry['scheduled_date'], "%d/%m/%Y %H:%M CET"),
                           'timezone': pytz.timezone('CET')
                       }
                       }

    def upsert_dataset(self, dataset_code):
        """Create or refresh one dataset (and its series) in the database.

        :returns: result of Datasets.update_database()
        """
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        #TODO: check whether the dataset already exists, else update !!!
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           last_update=datetime.now(),
                           fetcher=self)
        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        # FIX: removed pointless "try: ... except: raise" wrapper
        result = dataset.update_database()
        _data = None
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        return result

    def load_datasets_first(self):
        """Initial load: structure + provider + tree, then upsert every dataset."""
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        self._load_structure()
        self.provider.update_database()
        self.upsert_data_tree()
        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))
        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: incremental update — currently delegates to the full first load
        self.load_datasets_first()
class BIS(Fetcher):
    """Fetcher for the Bank for International Settlements (BIS) provider."""

    def __init__(self, db=None):
        super().__init__(provider_name="BIS", db=db)
        if not self.provider:
            self.provider = Providers(
                name=self.provider_name,
                long_name="Bank for International Settlements",
                version=VERSION,
                region="world",
                website="http://www.bis.org",
                fetcher=self,
            )
            self.provider.update_database()
        if self.provider.version != VERSION:
            # stored provider document predates this fetcher version
            self.provider.update_database()

    def upsert_dataset(self, dataset_code):
        """Download and upsert one dataset; skipped if already up to date.

        :raises Exception: for an unknown dataset_code
        """
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        if not DATASETS.get(dataset_code):
            # FIX: separator was missing between message and code
            raise Exception("This dataset is unknown: " + dataset_code)
        dataset = Datasets(
            provider_name=self.provider_name,
            dataset_code=dataset_code,
            name=DATASETS[dataset_code]["name"],
            doc_href=DATASETS[dataset_code]["doc_href"],
            fetcher=self,
        )
        fetcher_data = BIS_Data(dataset,
                                url=DATASETS[dataset_code]["url"],
                                filename=DATASETS[dataset_code]["filename"])
        if fetcher_data.is_updated():
            dataset.series.data_iterator = fetcher_data
            dataset.update_database()
            # TODO: clean datas (file temp)
            end = time.time() - start
            logger.info("upsert dataset[%s] - END-BEFORE-METAS - time[%.3f seconds]" % (dataset_code, end))
            self.update_metas(dataset_code)
            end = time.time() - start
            logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        else:
            logger.info(
                "upsert dataset[%s] bypass because is updated from release_date[%s]"
                % (dataset_code, fetcher_data.release_date)
            )

    def load_datasets_first(self):
        """Initial load: upsert every dataset from the DATASETS catalog."""
        start = time.time()
        logger.info("first load fetcher[%s] - START" % (self.provider_name))
        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code)
        end = time.time() - start
        logger.info("first load fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        self.load_datasets_first()

    def build_data_tree(self, force_update=False):
        """Build the data tree from the static DATASETS catalog.

        One category per dataset code; reuses the existing tree unless
        force_update.
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree
        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category(
                {"name": dataset["name"],
                 "category_code": category_code,
                 "doc_href": dataset["doc_href"]}
            )
            _dataset = {"name": dataset["name"],
                        "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)
        return self.provider.data_tree

    def parse_agenda(self):
        """Parse the BIS release calendar HTML into a table (list of rows).

        Row 0 holds [None, None, month datetimes...]; following rows hold
        [title1, title2-or-None, day strings or None per month].
        """
        agenda = etree.HTML(get_agenda())
        table = agenda.find(".//table")  # only one table
        rows = table[0].findall("tr")
        agenda = []
        # header row: month columns (first 2 cells are titles)
        months = [None, None]
        for c in rows[1].iterfind("td"):
            content = c.find("strong")
            if content.text is None:
                content = content.find("strong")
            months.append(datetime.datetime.strptime(content.text, "%B %Y"))
        agenda.append(months)
        ir = 2

        def get_links_text(cell):
            # texts of all <a> children that have text
            return [link.text for link in cell.findall("a") if link.text]

        def _get_dates(cells):
            # one entry per month column: day-of-month string, or None for
            # a non-breaking-space placeholder cell
            item = []
            for c in cells:
                if c.text[0] != chr(160):
                    # FIX: raw string for the regex pattern
                    item.append(re.match(r"\d\d|\d", c.text).group(0))
                else:
                    item.append(None)
            return item

        while ir < len(rows):
            cells = rows[ir].findall("td")
            content = cells[0]
            if content.text is None:
                content = content.find("a")
            item = [content.text]
            if cells[0].get("rowspan") == "2":
                # title spans two rows: second row lists sub-entries
                two_rows = True
                content = cells[1].find("a")
                item.append(content.text)
                offset = 2
            else:
                two_rows = False
                item.append(None)
                offset = 1
            item.extend(_get_dates(cells[offset:]))
            agenda.append(item)
            ir += 1
            if two_rows:
                cells = rows[ir].findall("td")
                links = get_links_text(cells[0])
                for content in links:
                    item = [item[0]]
                    item.append(content)
                    item.extend(_get_dates(cells[1:]))
                    agenda.append(item)
                ir += 1
        return agenda

    def get_calendar(self):
        """Yield scheduler actions for scheduled releases of known datasets."""
        agenda = self.parse_agenda()
        dataset_codes = [d["dataset_code"] for d in self.datasets_list()]
        # first row - exclude first 2 columns (title1, title2)
        months = agenda[0][2:]
        # all rows except the header
        periods = agenda[1:]

        def _get_dataset_code(title):
            # reverse lookup: agenda title -> dataset code, None if unknown
            for key, d in DATASETS.items():
                if title in d.get("agenda_titles", []):
                    return key
            return None

        for period in periods:
            title = period[0]
            if period[1]:
                title = "%s %s" % (title, period[1])
            dataset_code = _get_dataset_code(title)
            if not dataset_code:
                logger.info("exclude calendar action for not implemented dataset[%s]" % title)
                continue
            if dataset_code not in dataset_codes:
                logger.info("exclude calendar action for dataset[%s]" % title)
                continue
            days = period[2:]
            scheds = [d for d in zip(months, days) if d[1] is not None]
            for date_base, day in scheds:
                # NOTE(review): pytz.country_timezones() returns a list of
                # zone names, not a tzinfo — confirm consumers expect that.
                yield {
                    "action": "update_node",
                    "kwargs": {"provider_name": self.provider_name,
                               "dataset_code": dataset_code},
                    "period_type": "date",
                    "period_kwargs": {
                        "run_date": datetime.datetime(date_base.year,
                                                      date_base.month,
                                                      int(day), 8, 0, 0),
                        "timezone": pytz.country_timezones(AGENDA["country"]),
                    },
                }
class INSEE(Fetcher):
    """Fetcher for INSEE (French national statistics) via SDMX."""

    def __init__(self, db=None, sdmx=None, **kwargs):
        super().__init__(provider_name='INSEE', db=db, **kwargs)
        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='National Institute of Statistics and Economic Studies',
                                      version=VERSION,
                                      region='France',
                                      website='http://www.insee.fr',
                                      fetcher=self)
            self.provider.update_database()
        if self.provider.version != VERSION:
            # stored provider document predates this fetcher version
            self.provider.update_database()
        self.sdmx = sdmx or Request(agency='INSEE')
        # SDMX structure caches, filled lazily by _load_structure()
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None

    def _load_structure(self, force=False):
        """Load the SDMX dataflows (category schemes are currently disabled)."""
        if self._dataflows and not force:
            return
        """
        #http://www.bdm.insee.fr/series/sdmx/categoryscheme
        categoryscheme_response = self.sdmx.get(resource_type='categoryscheme', params={"references": None})
        logger.debug(categoryscheme_response.url)
        self._categoryschemes = categoryscheme_response.msg.categoryschemes

        #http://www.bdm.insee.fr/series/sdmx/categorisation
        categorisation_response = self.sdmx.get(resource_type='categorisation')
        logger.debug(categorisation_response.url)
        self._categorisations = categorisation_response.msg.categorisations
        """
        #http://www.bdm.insee.fr/series/sdmx/dataflow
        dataflows_response = self.sdmx.get(resource_type='dataflow')
        logger.debug(dataflows_response.url)
        self._dataflows = dataflows_response.msg.dataflows

    def load_datasets_first(self):
        """Initial load: upsert every dataset; errors are logged, not raised."""
        start = time.time()
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        # FIX: extract dataset_code from each settings dict — datasets_list()
        # yields dicts (see every other fetcher), not bare codes
        for dataset_code in [d["dataset_code"] for d in self.datasets_list()]:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))
        end = time.time() - start
        # FIX: END message said "update fetcher" under a "first load" START
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: incremental update — currently delegates to the full first load
        self.load_datasets_first()

    def build_data_tree(self, force_update=False):
        """Build data_tree from structure datas.

        Flat tree: every dataflow is attached directly under the provider
        root, with the English name when available, else the French one.
        (The category-scheme based tree that previously followed the return
        statement was unreachable and has been removed.)

        :returns: the provider's data_tree list
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree
        self._load_structure()
        for dataset_code, dataset in self._dataflows.items():
            if "en" in dataset.name:
                name = dataset.name.en
            else:
                name = dataset.name.fr
            self.provider.add_dataset(dict(dataset_code=dataset_code,
                                           name=name),
                                      self.provider_name)
        return self.provider.data_tree

    def upsert_dataset(self, dataset_code):
        """Create or refresh one dataset (and its series) in the database.

        :returns: result of Datasets.update_database()
        """
        #self.load_structure(force=False)
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        #if not dataset_code in self._dataflows:
        #    raise Exception("This dataset is unknown: %s" % dataset_code)
        #dataflow = self._dataflows[dataset_code]
        #cat = self.db[constants.COL_CATEGORIES].find_one({'category_code': dataset_code})
        #dataset.name = cat['name']
        #dataset.doc_href = cat['doc_href']
        #dataset.last_update = cat['last_update']
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           #name=dataflow.name.en,
                           doc_href=None,
                           last_update=datetime.now(),  #TODO:
                           fetcher=self)
        # previous version of the dataset document (None on first load)
        dataset_doc = self.db[constants.COL_DATASETS].find_one(
            {'provider_name': self.provider_name,
             "dataset_code": dataset_code})
        insee_data = INSEE_Data(dataset=dataset,
                                dataset_doc=dataset_doc,
                                #dataflow=dataflow,
                                #sdmx=self.sdmx
                                )
        dataset.series.data_iterator = insee_data
        result = dataset.update_database()
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

        # > IDBANK: to be resolved dynamically from the site?
        #   doc_href of a series:
        #   http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=001694226
        # > GROUP CODE: monthly Balance of Payments - capital account
        #   http://www.bdm.insee.fr/bdm2/choixCriteres?codeGroupe=1556
        return result
class Eurostat(Fetcher):
    """Class for managing the SDMX endpoint from eurostat in dlstats."""

    def __init__(self, db=None):
        super().__init__(provider_name='Eurostat', db=db)
        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='Eurostat',
                                      version=VERSION,
                                      region='Europe',
                                      website='http://ec.europa.eu/eurostat',
                                      fetcher=self)
            self.provider.update_database()
        if self.provider.version != VERSION:
            # stored provider document predates this fetcher version
            self.provider.update_database()
        # table-of-contents branch codes that this fetcher loads
        self.selected_codes = ['nama_10', 'namq_10', 'nasa_10', 'nasq_10', 'naid_10',
                               'nama', 'namq', 'nasa', 'nasq', 'gov', 'ert',
                               'irt', 'prc', 'bop', 'bop_6', 'demo_pjanbroad',
                               'lfsi_act_q']
        self.selected_datasets = {}
        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.dataset_url = None

    def build_data_tree(self, force_update=False):
        """Build the data tree by streaming Eurostat's table-of-contents XML.

        Only branches having at least one <code> present in selected_codes
        produce categories; likewise a dataset is registered only when one
        of its parent category codes is selected. Reuses the existing tree
        unless force_update is True.

        :returns: the provider's data_tree list
        """
        start = time.time()
        logger.info("build_data_tree provider[%s] - START" % self.provider_name)

        if self.provider.count_data_tree() > 1 and not force_update:
            logger.info("use existing data-tree for provider[%s]" % self.provider_name)
            return self.provider.data_tree

        filepath = self.get_table_of_contents()
        # streaming parse: elements are cleared below to bound memory use
        it = etree.iterparse(filepath, events=['end'])

        def is_selected(parent_codes):
            """parent_codes is array of category_code"""
            for _select in self.selected_codes:
                if _select in parent_codes:
                    return True
            return False

        for event, element in it:
            if event == 'end':
                if element.tag == fixtag_toc('nt', 'branch'):
                    for child in element.iterchildren(tag=fixtag_toc('nt', 'children')):
                        _parent_codes = xpath_parent_codes(child)
                        _parents = xpath_ancestor_branch(child)
                        if not is_selected(_parent_codes):
                            continue
                        # register every ancestor category of this branch
                        for parent in _parents:
                            _parent_code = xpath_code(parent)[0]
                            _parent_title = xpath_title(parent)[0]
                            '''Extrait la partie gauche des categories parents'''
                            # left part of the code chain = this category's parent key
                            _parent_categories = ".".join(_parent_codes[:_parent_codes.index(_parent_code)])
                            _category = None
                            _parent = None
                            if not _parent_categories or len(_parent_categories) == 0:
                                # top-level category: no parent key
                                _category = {"category_code": _parent_code,
                                             "name": _parent_title}
                            else:
                                _parent = self.provider._category_key(_parent_categories)
                                _category = {"category_code": _parent_code,
                                             "name": _parent_title}
                            try:
                                _key = self.provider.add_category(_category, _parent)
                            except:
                                # no handling: presumably raised only when the
                                # category already exists — TODO confirm
                                pass
                        datasets = xpath_datasets(child)
                        for dataset in datasets:
                            parent_codes = xpath_parent_codes(dataset)
                            dataset_code = xpath_code(dataset)[0]
                            category_code = self.provider._category_key(".".join(parent_codes))
                            '''Verifie si au moins un des category_code est dans selected_codes'''
                            # skip unless at least one parent code is selected
                            if not is_selected(parent_codes):
                                continue
                            name = xpath_title(dataset)[0]
                            last_update = xpath_ds_last_update(dataset)
                            last_modified = xpath_ds_last_modified(dataset)
                            doc_href = xpath_ds_metadata_html(dataset)
                            data_start = xpath_ds_data_start(dataset)
                            data_end = xpath_ds_data_end(dataset)
                            values = xpath_ds_values(dataset)
                            # effective last_update = max(update, modification)
                            last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
                            if last_modified:
                                last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
                                last_update = max(last_update, last_modified)
                            # NOTE(review): "dataset" now shadows the XML
                            # element; .clear() below empties this dict, not
                            # the element — confirm intent
                            dataset = {"dataset_code": dataset_code,
                                       "name": name,
                                       "last_update": last_update,
                                       "metadata": {
                                           "doc_href": first_element_xpath(doc_href),
                                           "data_start": first_element_xpath(data_start),
                                           "data_end": first_element_xpath(data_end),
                                           "values": int(first_element_xpath(values, default="0")),
                                       }}
                            self.provider.add_dataset(dataset, category_code)
                            dataset.clear()
                        child.clear()
                    element.clear()

        end = time.time() - start
        logger.info("build_data_tree load provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

        return self.provider.data_tree

    def get_table_of_contents(self):
        """Download (or reuse) the table-of-contents XML; return its local path."""
        return Downloader(url=self.url_table_of_contents,
                          filename="table_of_contents.xml").get_filepath()

    def get_selected_datasets(self):
        """Collects the dataset codes that are in table of contents
        below the ones indicated in "selected_codes" provided in configuration
        :returns: dict of dataset settings keyed by dataset_code"""
        category_filter = [".*%s.*" % d for d in self.selected_codes]
        category_filter = "|".join(category_filter)
        self.selected_datasets = {d['dataset_code']: d
                                  for d in self.datasets_list(category_filter=category_filter)}
        return self.selected_datasets

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets
        :dset: dataset_code
        :returns: None"""
        self.get_selected_datasets()
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        dataset_settings = self.selected_datasets[dataset_code]
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=dataset_settings["name"],
                           doc_href=dataset_settings["metadata"].get("doc_href"),
                           last_update=dataset_settings["last_update"],
                           fetcher=self)
        data_iterator = EurostatData(dataset, filename=dataset_code)
        dataset.series.data_iterator = data_iterator
        dataset.update_database()
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def load_datasets_first(self):
        """Initial load: upsert every selected dataset; errors logged, not raised."""
        self.get_selected_datasets()
        start = time.time()
        logger.info("first load provider[%s] - START" % (self.provider_name))
        for dataset_code in self.selected_datasets.keys():
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))
        end = time.time() - start
        logger.info("first load provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        """Update load: re-upsert only datasets that are missing from the DB
        or whose stored last_update is older than the table of contents'."""
        self.get_selected_datasets()
        start = time.time()
        logger.info("update provider[%s] - START" % (self.provider_name))
        # stored last_update per dataset_code, for staleness comparison
        selected_datasets = self.db[constants.COL_DATASETS].find(
            {'provider_name': self.provider_name,
             'dataset_code': {'$in': list(self.selected_datasets.keys())}},
            {'dataset_code': 1, 'last_update': 1})
        selected_datasets = {s['dataset_code']: s for s in selected_datasets}
        for dataset_code, dataset in self.selected_datasets.items():
            if (dataset_code not in selected_datasets) or (selected_datasets[dataset_code]['last_update'] < dataset['last_update']):
                try:
                    self.upsert_dataset(dataset_code)
                except Exception as err:
                    logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))
        end = time.time() - start
        logger.info("update provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))