Example #1
0
class WorldBank(Fetcher):
    """Fetcher for World Bank data (currently only the GEM dataset)."""

    def __init__(self, db=None):
        """
        :param db: optional database handle, passed through to Fetcher.
        """
        super().__init__(provider_name='WorldBank', db=db)

        self.provider = Providers(name=self.provider_name,
                                  long_name='World Bank',
                                  version=VERSION,
                                  region='world',
                                  website='http://www.worldbank.org/',
                                  fetcher=self)

    def upsert_categories(self):
        """Register the provider's category tree (root + GEM child)."""
        data_tree = {'name': 'World Bank',
                     'category_code': 'worldbank_root',
                     'children': [{'name': 'GEM',
                                   'category_code': 'GEM',
                                   'exposed': True,
                                   'children': []}]}
        self.provider.add_data_tree(data_tree)

    def upsert_dataset(self, dataset_code):
        """Update (insert or refresh) one dataset identified by *dataset_code*.

        :raises Exception: if *dataset_code* is not a known dataset.
        """
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        #TODO return the _id field of the corresponding dataset. Update the category accordingly.
        if dataset_code == 'GEM':
            self.upsert_gem(dataset_code)
        else:
            # BUG FIX: the original referenced an undefined name `dataCode`
            # here, which raised NameError instead of the intended message.
            raise Exception("This dataset is unknown" + dataset_code)
        self.update_metas(dataset_code)
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def upsert_gem(self, dataset_code):
        """Load the GEM dataset from its configured URL into the database."""
        d = DATASETS[dataset_code]
        url = d['url']
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=d['name'],
                           doc_href=d['doc_href'],
                           fetcher=self)
        gem_data = GemData(dataset, url)
        dataset.last_update = gem_data.release_date
        dataset.series.data_iterator = gem_data
        dataset.update_database()

    def upsert_all_datasets(self):
        """Update every dataset of this provider (only GEM at present)."""
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))
        self.upsert_dataset('GEM')
        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def datasets_list(self):
        """Return the known dataset codes."""
        return DATASETS.keys()

    def datasets_long_list(self):
        """Return (code, name) pairs for all known datasets."""
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    def download(self, dataset_code=None, url=None):
        """Download *url* to a temp file named after *dataset_code*.

        :returns: tuple (Last-Modified header value, local file path).
        :raises Exception: on HTTP error status or network failure.
        """
        filepath_dir = os.path.abspath(os.path.join(tempfile.gettempdir(),
                                                    self.provider_name))

        filepath = "%s.zip" % os.path.abspath(os.path.join(filepath_dir, dataset_code))

        if not os.path.exists(filepath_dir):
            os.makedirs(filepath_dir, exist_ok=True)

        if os.path.exists(filepath):
            os.remove(filepath)

        if logger.isEnabledFor(logging.INFO):
            logger.info("store file to [%s]" % filepath)

        start = time.time()
        try:
            # NOTE(review): verify=False disables TLS certificate checks —
            # kept for backward compatibility, but worth revisiting.
            response = requests.get(url,
                                    #TODO: timeout=self.timeout,
                                    stream=True,
                                    allow_redirects=True,
                                    verify=False)

            if not response.ok:
                msg = "download url[%s] - status_code[%s] - reason[%s]" % (url,
                                                                           response.status_code,
                                                                           response.reason)
                logger.error(msg)
                # NOTE(review): this raise is itself caught by the broad
                # `except Exception` below and re-wrapped — same as the
                # original behavior, preserved here.
                raise Exception(msg)

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content():
                    f.write(chunk)

            last_modified = response.headers['Last-Modified']

        except requests.exceptions.ConnectionError as err:
            raise Exception("Connection Error") from err
        except requests.exceptions.ConnectTimeout as err:
            raise Exception("Connect Timeout") from err
        except requests.exceptions.ReadTimeout as err:
            raise Exception("Read Timeout") from err
        except Exception as err:
            raise Exception("Not captured exception : %s" % str(err)) from err

        # BUG FIX: this timing log was originally placed after the `return`
        # inside the try block (and after re-raising excepts), so it was
        # unreachable dead code.
        end = time.time() - start
        logger.info("download file[%s] - END - time[%.3f seconds]" % (url, end))

        return last_modified, filepath
Example #2
0
class IMF(Fetcher):
    """Fetcher for International Monetary Fund data (currently only WEO)."""

    def __init__(self, db=None, **kwargs):
        """
        :param db: optional database handle, passed through to Fetcher.
        """
        super().__init__(provider_name='IMF', db=db, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name="International Monetary Fund",
                                  version=VERSION,
                                  region='world',
                                  website='http://www.imf.org/',
                                  fetcher=self)

    def upsert_all_datasets(self):
        """Update every dataset declared in DATASETS."""
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))

        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code)

        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def upsert_dataset(self, dataset_code):
        """Update one dataset; each WEO issue is processed in turn.

        :raises Exception: if *dataset_code* is not a known dataset.
        """
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        if dataset_code == 'WEO':
            for u in self.weo_urls:
                self.upsert_weo_issue(u, dataset_code)
        else:
            raise Exception("This dataset is unknown" + dataset_code)

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def datasets_list(self):
        """Return the known dataset codes."""
        return DATASETS.keys()

    def datasets_long_list(self):
        """Return (code, name) pairs for all known datasets."""
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    @property
    def weo_urls(self):
        """Scrape the IMF WEO page and return download URLs, oldest first.

        NOTE(review): performs network requests on every access (no caching).
        """
        webpage = requests.get('http://www.imf.org/external/ns/cs.aspx?id=28')

        #TODO: replace by beautifoulsoup ?
        html = etree.HTML(webpage.text)
        hrefs = html.xpath("//div[@id = 'content-main']/h4/a['href']")
        links = [href.values() for href in hrefs]

        #The last links of the WEO webpage lead to data we dont want to pull.
        links = links[:-16]
        #These are other links we don't want.
        # NOTE(review): the second pop operates on the already-shortened list,
        # so -10 refers to a shifted element — confirm this is intended.
        links.pop(-8)
        links.pop(-10)
        links = [link[0][:-10] + 'download.aspx' for link in links]

        output = []

        for link in links:
            webpage = requests.get(link)
            html = etree.HTML(webpage.text)
            final_link = html.xpath("//div[@id = 'content']//table//a['href']")
            final_link = final_link[0].values()
            output.append(link[:-13] + final_link[0])

        # we need to handle the issues in chronological order
        return sorted(output)

    def upsert_weo_issue(self, url, dataset_code):
        """Load one WEO issue from *url* into the database.

        Errors during the database update are logged (not propagated) so
        the remaining issues still get processed.
        """
        settings = DATASETS[dataset_code]

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=settings['name'],
                           doc_href=settings['doc_href'],
                           fetcher=self)

        weo_data = WeoData(dataset, url)
        dataset.last_update = weo_data.release_date
        dataset.attribute_list.update_entry('flags', 'e', 'Estimated')
        dataset.series.data_iterator = weo_data
        try:
            dataset.update_database()
            self.update_metas(dataset_code)
        except Exception as err:
            # logger.exception keeps the original message but also records
            # the full traceback (logger.error(str(err)) discarded it).
            logger.exception(str(err))

    def upsert_categories(self):
        """Register the provider's category tree (root + WEO child)."""
        data_tree = {'name': 'IMF',
                     'category_code': 'imf_root',
                     'children': [{'name': 'WEO',
                                   'category_code': 'WEO',
                                   'exposed': True,
                                   'children': []}]}
        self.provider.add_data_tree(data_tree)
Example #3
0
class OECD(Fetcher):
    """Fetcher for OECD datasets."""

    def __init__(self, db=None, **kwargs):
        """
        :param db: optional database handle, passed through to Fetcher.
        """
        super().__init__(provider_name='OECD', db=db, **kwargs)
        # NOTE: the redundant `self.provider_name = 'OECD'` reassignment was
        # removed — the sibling fetchers (WorldBank, IMF) rely on
        # Fetcher.__init__ setting it from the provider_name argument.
        self.provider = Providers(name=self.provider_name,
                                  long_name='Organisation for Economic Co-operation and Development',
                                  version=VERSION,
                                  region='world',
                                  website='http://www.oecd.org',
                                  fetcher=self)

    def upsert_dataset(self, dataset_code, datas=None):
        """Update one dataset identified by *dataset_code*.

        :param datas: unused; kept for interface compatibility.
        :raises Exception: if *dataset_code* is not a known dataset.
        """
        start = time.time()

        logger.info("upsert dataset[%s] - START" % (dataset_code))

        # Membership test instead of truthiness of .get(): a dataset whose
        # settings dict happened to be falsy would otherwise be rejected.
        if dataset_code not in DATASETS:
            raise Exception("This dataset is unknown" + dataset_code)

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           fetcher=self)

        fetcher_data = OECD_Data(dataset)
        dataset.series.data_iterator = fetcher_data
        dataset.update_database()

        end = time.time() - start
        logger.info("upsert dataset[%s] - END-BEFORE-METAS - time[%.3f seconds]" % (dataset_code, end))

        self.update_metas(dataset_code)

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def datasets_list(self):
        """Return the known dataset codes."""
        return DATASETS.keys()

    def datasets_long_list(self):
        """Return (code, name) pairs for all known datasets."""
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    def upsert_all_datasets(self):
        """Update every dataset declared in DATASETS."""
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))

        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code)
        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def upsert_categories(self):
        """Register the provider's category tree, one child per dataset."""
        data_tree = {'name': 'OECD',
                     'category_code': 'oecd_root',
                     'children': []}

        for dataset_code in DATASETS.keys():
            data_tree['children'].append({'name': DATASETS[dataset_code]['name'],
                                          'category_code': dataset_code,
                                          'exposed': True,
                                          'children': None})

        self.provider.add_data_tree(data_tree)