class WorldBank(Fetcher):

    def __init__(self, db=None):
        super().__init__(provider_name='WorldBank', db=db)
        self.provider = Providers(name=self.provider_name,
                                  long_name='World Bank',
                                  version=VERSION,
                                  region='world',
                                  website='http://www.worldbank.org/',
                                  fetcher=self)

    def upsert_categories(self):
        data_tree = {'name': 'World Bank',
                     'category_code': 'worldbank_root',
                     'children': [{'name': 'GEM',
                                   'category_code': 'GEM',
                                   'exposed': True,
                                   'children': []}]}
        self.provider.add_data_tree(data_tree)

    def upsert_dataset(self, dataset_code):
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        # TODO: return the _id field of the corresponding dataset
        #       and update the category accordingly.
        if dataset_code == 'GEM':
            self.upsert_gem(dataset_code)
        else:
            raise Exception("This dataset is unknown: " + dataset_code)
        self.update_metas(dataset_code)
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def upsert_gem(self, dataset_code):
        d = DATASETS[dataset_code]
        url = d['url']
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=d['name'],
                           doc_href=d['doc_href'],
                           fetcher=self)
        gem_data = GemData(dataset, url)
        dataset.last_update = gem_data.release_date
        dataset.series.data_iterator = gem_data
        dataset.update_database()

    def upsert_all_datasets(self):
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))
        self.upsert_dataset('GEM')
        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def datasets_list(self):
        return DATASETS.keys()

    def datasets_long_list(self):
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    def download(self, dataset_code=None, url=None):
        # Download the dataset archive into a per-provider temporary directory.
        filepath_dir = os.path.abspath(os.path.join(tempfile.gettempdir(), self.provider_name))
        filepath = "%s.zip" % os.path.abspath(os.path.join(filepath_dir, dataset_code))

        if not os.path.exists(filepath_dir):
            os.makedirs(filepath_dir, exist_ok=True)

        if os.path.exists(filepath):
            os.remove(filepath)

        if logger.isEnabledFor(logging.INFO):
            logger.info("store file to [%s]" % filepath)

        start = time.time()
        try:
            response = requests.get(url,
                                    # TODO: timeout=self.timeout,
                                    stream=True,
                                    allow_redirects=True,
                                    verify=False)

            if not response.ok:
                msg = "download url[%s] - status_code[%s] - reason[%s]" % (url,
                                                                           response.status_code,
                                                                           response.reason)
                logger.error(msg)
                raise Exception(msg)

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content():
                    f.write(chunk)

            end = time.time() - start
            logger.info("download file[%s] - END - time[%.3f seconds]" % (url, end))

            return response.headers['Last-Modified'], filepath

        # ConnectTimeout subclasses ConnectionError, so it must be caught first.
        except requests.exceptions.ConnectTimeout:
            raise Exception("Connect Timeout")
        except requests.exceptions.ConnectionError:
            raise Exception("Connection Error")
        except requests.exceptions.ReadTimeout:
            raise Exception("Read Timeout")
        except Exception as err:
            raise Exception("Not captured exception: %s" % str(err))
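# Usage sketch (illustrative, not part of the original module). Assumes the
# MongoDB-backed setup the Fetcher base class provides when db is None; the
# call order below is an assumption, not a documented workflow.
if __name__ == '__main__':
    fetcher = WorldBank()
    fetcher.upsert_categories()      # register the 'worldbank_root' data tree
    fetcher.upsert_all_datasets()    # currently upserts only the 'GEM' dataset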
class IMF(Fetcher):

    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='IMF', db=db, **kwargs)
        self.provider = Providers(name=self.provider_name,
                                  long_name="International Monetary Fund",
                                  version=VERSION,
                                  region='world',
                                  website='http://www.imf.org/',
                                  fetcher=self)

    def upsert_all_datasets(self):
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))
        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code)
        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def upsert_dataset(self, dataset_code):
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        if dataset_code == 'WEO':
            for u in self.weo_urls:
                self.upsert_weo_issue(u, dataset_code)
        else:
            raise Exception("This dataset is unknown: " + dataset_code)
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def datasets_list(self):
        return DATASETS.keys()

    def datasets_long_list(self):
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    @property
    def weo_urls(self):
        # Scrape the WEO download page for the archive link of every release.
        webpage = requests.get('http://www.imf.org/external/ns/cs.aspx?id=28')
        # TODO: replace by BeautifulSoup?
        html = etree.HTML(webpage.text)
        hrefs = html.xpath("//div[@id = 'content-main']/h4/a['href']")
        links = [href.values() for href in hrefs]
        # The last links of the WEO webpage lead to data we don't want to pull.
        links = links[:-16]
        # These are other links we don't want.
        links.pop(-8)
        links.pop(-10)
        links = [link[0][:-10] + 'download.aspx' for link in links]
        output = []
        for link in links:
            webpage = requests.get(link)
            html = etree.HTML(webpage.text)
            final_link = html.xpath("//div[@id = 'content']//table//a['href']")
            final_link = final_link[0].values()
            output.append(link[:-13] + final_link[0])
        # We need to handle the issues in chronological order.
        return sorted(output)

    def upsert_weo_issue(self, url, dataset_code):
        settings = DATASETS[dataset_code]
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=settings['name'],
                           doc_href=settings['doc_href'],
                           fetcher=self)
        weo_data = WeoData(dataset, url)
        dataset.last_update = weo_data.release_date
        dataset.attribute_list.update_entry('flags', 'e', 'Estimated')
        dataset.series.data_iterator = weo_data
        try:
            dataset.update_database()
            self.update_metas(dataset_code)
        except Exception as err:
            logger.error(str(err))

    def upsert_categories(self):
        data_tree = {'name': 'IMF',
                     'category_code': 'imf_root',
                     'children': [{'name': 'WEO',
                                   'category_code': 'WEO',
                                   'exposed': True,
                                   'children': []}]}
        self.provider.add_data_tree(data_tree)
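# Usage sketch (illustrative, not part of the original module; same assumptions
# as the WorldBank example). upsert_dataset('WEO') walks every release URL found
# by the weo_urls property, oldest first, so later issues overwrite earlier ones.
if __name__ == '__main__':
    imf = IMF()
    imf.upsert_categories()
    print(imf.datasets_long_list())   # dataset codes and names taken from DATASETS
    imf.upsert_dataset('WEO')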
class OECD(Fetcher):

    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='OECD', db=db, **kwargs)
        self.provider_name = 'OECD'
        self.provider = Providers(name=self.provider_name,
                                  long_name='Organisation for Economic Co-operation and Development',
                                  version=VERSION,
                                  region='world',
                                  website='http://www.oecd.org',
                                  fetcher=self)

    def upsert_dataset(self, dataset_code, datas=None):
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        if not DATASETS.get(dataset_code):
            raise Exception("This dataset is unknown: " + dataset_code)
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           fetcher=self)
        fetcher_data = OECD_Data(dataset)
        dataset.series.data_iterator = fetcher_data
        dataset.update_database()
        end = time.time() - start
        logger.info("upsert dataset[%s] - END-BEFORE-METAS - time[%.3f seconds]" % (dataset_code, end))
        self.update_metas(dataset_code)
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def datasets_list(self):
        return DATASETS.keys()

    def datasets_long_list(self):
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    def upsert_all_datasets(self):
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))
        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code)
        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def upsert_categories(self):
        data_tree = {'name': 'OECD',
                     'category_code': 'oecd_root',
                     'children': []}
        for dataset_code in DATASETS.keys():
            data_tree['children'].append({'name': DATASETS[dataset_code]['name'],
                                          'category_code': dataset_code,
                                          'exposed': True,
                                          'children': None})
        self.provider.add_data_tree(data_tree)
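# Usage sketch (illustrative, not part of the original module; same assumptions
# as above). Unlike the WorldBank fetcher, upsert_all_datasets here iterates over
# every code defined in DATASETS, so one call refreshes all configured OECD datasets.
if __name__ == '__main__':
    oecd = OECD()
    oecd.upsert_categories()
    oecd.upsert_all_datasets()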