Example #1
class IOrganiser(object):
    '''
    Fetches data using the Extractor and organises it
    in json files for an appropriate visualisation presenter.
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self._extractor = Extractor()
        # will store a dictionary with the necessary data
        self.vis_data = None
    
    def _write_data(self):
        """
        @param vis_data: a file where data
        should be stored   
        """
        filename = "data.json"
        with open(filename, "w") as out_file:
            json_text = json.dumps(self.vis_data, indent=4)
            out_file.write(json_text)
    
    def _organise_data(self, conf):
        """
        should use the extractor to fetch what is needed
        and format it in a dictionary (store in self.vis_data).
        @attention: must override
        """
        raise MustOverrideError
    
    def get_representation(self, conf):
        if conf.cache_enabled:
            self._extractor.enable_cache(conf.cache_host, conf.cache_port)
        self._organise_data(conf)
        if conf.cache_enabled and self._extractor.was_cached():
            print("Cache was hit, didn't have to query the World Bank API.")
        elif conf.cache_enabled:
            print("Data wasn't cached, queried the World Bank API.")
        #self._write_data()
        return self.vis_data
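
For orientation, a minimal concrete organiser might look like this (a hypothetical sketch: the subclass name, indicator code, and dictionary layout are illustrative, not part of the original code):

class PopulationOrganiser(IOrganiser):
    '''
    Hypothetical organiser: total population per country,
    laid out for a simple presenter.
    '''

    def _organise_data(self, conf):
        # fetch the data with the same arg/grab API used elsewhere
        arg = self._extractor.arg()
        arg["country_codes"] = conf.countries
        arg["indicator_codes"] = ["SP.POP.TOTL"]  # illustrative indicator
        arg["interval"] = (conf.start_date, conf.end_date)
        countries = self._extractor.grab(arg)
        # store a simple {country code: values} dictionary for the presenter
        self.vis_data = {
            country.code: country.get_indicator("SP.POP.TOTL").values
            for country in countries
        }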
Example #2
    def test_extractor(self):
        extractor = Extractor()
        countries = extractor.grab()
        self.assertTrue(len(countries) > 0)

        arg = extractor.arg()
        arg["country_codes"] = ["usa", "hrv"]
        arg["indicator_codes"] = ["SP.POP.TOTL", "SL.TLF.PART.MA.ZS"]
        arg["interval"] = (2005, 2006)
        countries = extractor.grab(arg)
        self.assertTrue(len(countries) > 0)

        arg = extractor.arg()
        arg["country_codes"] = ["hrv"]
        arg["indicator_codes"] = ["SP.POP.TOTL"]
        arg["interval"] = (1998, 1999)
        countries = extractor.grab(arg)
        indicator = countries[0].get_indicator("SP.POP.TOTL")
        #print(indicator.get_values())
        self.assertEqual(indicator.values, [4501000.0, 4554000.0])
        self.assertEqual(indicator.dates, [1998, 1999])
Example #3
#!/usr/bin/python

import sys
import fileinput
from dracula.extractor import Extractor

if __name__=="__main__":
    try:
        desc_path = sys.argv[1]
    except:
        print("Usage:\npython extend_sgd_description.py <desired .sgd_description file>")
        exit()

    extractor = Extractor()
    country_names = {}
    countries = extractor.grab_metadata("countries")
    for country in countries:
    country_names[country.code] = country.name

for line in fileinput.input(desc_path, inplace=True):
    num_country, year = line.rstrip().split("-")
        number, country_code = num_country.split(" ")
    try:
        country_name = country_names[country_code]
    except KeyError:
        country_name = "????"
        sys.stdout.write("%s %s-%s-%s\n" % (number, country_code, year, country_name))
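
For reference, the in-place rewrite this performs on each line of the .sgd_description file (the sample line is hypothetical, inferred from the parsing code above):

# input line:             3 hrv-1998
# rewritten in place as:  3 hrv-1998-Croatia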
Example #4
class IVisualisation(object):
    '''
    Abstract visualisation,
    independent of concrete
    implementations.
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self._counter = 0
        self._got_items = False
        # initialize default configuration options
        
        #TODO: IVisualisation shouldn't use the extractor at all, but rely on a data organiser
        self._extractor = Extractor()
        if conf.cache_enabled:
            self._extractor.enable_cache(conf.cache_host, conf.cache_port)
        
    def get_conf(self):
        return conf
        
    def _should_add_meta_marks(self):
        """
        Tells if additional graph stuff (axis labels, legend)
        should be placed 
        @return: Boolean
        """
        return not conf.combine_plots or self._counter == 0
    
    def get_title(self):
        if conf.auto_title:
            return self._auto_graph_title()
        else:
            return conf.graph_title
        
    def _auto_graph_title(self):
        if conf.combine_plots:
            country_representation = ", ".join([str(item).upper() for item in self._get_items()])
        else:
            country_representation = str(self._get_items()[self._counter]).upper()
        title = "%s - %s" % (country_representation, conf.title_end) 
        return title
        
    def _get_items(self):
        """
        Get items that form independent units of data for
        drawing. Normally these are countries, but this can
        be overriden. Each item then gets passed to the
        _create_figure function to draw them on a graph.
        @return: list of items
        """
        #TODO: this should just get a data list from the data organiser
        if not self._got_items:
            arg = self._extractor.arg()
            arg["country_codes"] = conf.countries
            arg["indicator_codes"] = conf.indicators
            arg["interval"] = (conf.start_date, conf.end_date)
            self.countries = self._extractor.grab(arg)
            #TODO: preprocessor
#            self._extractor.process(conf.process_indicators,
#                                   method = "slope",
#                                   look_back_years=conf.look_back_years)
            if conf.cache_enabled and self._extractor.was_cached():
                print("Cache was hit, didn't have to query the World Bank API.")
            elif conf.cache_enabled:
                print("Data wasn't cached, queried the World Bank API.")
            self._got_items = True
        return self.countries
    
    def _start_new_figure(self):
        """ a hook for doing pre-plot stuff """
        raise MustOverrideError
    
    def _finish_figure(self):
        """ a hook for doing post-plot stuff """
        raise MustOverrideError
    
    def _create_figure(self, item):
        """
        Create a figure and return it as a matplotlib object. Must override.
        """
        raise MustOverrideError
    
    def _show(self):
        """ a hook to actually show the graph (potentially blocking code)"""
        pass
    
    def show(self):
        if not conf.write_to_file: # interactive
            self._show()
    
    def create_all_figures(self, vis_data):
        """
        Write all figures to file(s) or plot in one or more windows.
        """
        # we create only one figure if this is a combo plot
        if conf.combine_plots:
            self._start_new_figure()
        # iterate through items (e.g. countries)
        #TODO: actually use vis_data here and
        # in the complete_multigroup_visualisation
        for item in self._get_items():
            if not conf.combine_plots:
                self._start_new_figure()
            self._create_figure(item)
            # this counter is important for subclasses. Be careful!
            self._counter += 1
            if not conf.combine_plots:
                self._finish_figure()
        # store the plots in case this is a combined plot 
        if conf.combine_plots:
            self._finish_figure()
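
For orientation, a minimal concrete subclass might implement the mandatory hooks like this (a hypothetical sketch assuming matplotlib and the module-level conf used above; LinePlotVisualisation is not part of the original code):

import matplotlib.pyplot as plt

class LinePlotVisualisation(IVisualisation):
    '''
    Hypothetical visualisation: one line per country for the
    first configured indicator.
    '''

    def _start_new_figure(self):
        # hook: open a fresh matplotlib figure before any plotting
        plt.figure()

    def _create_figure(self, item):
        # hook: draw one item (a country) onto the current figure
        indicator = item.get_indicator(conf.indicators[0])
        plt.plot(indicator.dates, indicator.values, label=str(item).upper())

    def _finish_figure(self):
        # hook: decorate the finished figure
        plt.legend()
        plt.title(self.get_title())

    def _show(self):
        # blocking call; only reached when conf.write_to_file is false
        plt.show()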
Example #5
class SamplesSet(object):
    '''
    Responsible for building train and test sets
    '''
    def __init__(self,
                 look_back_years,
                 cache_enabled=False,
                 cache_host="localhost",
                 cache_port=27017):
        '''
        Constructor
        '''
        self.t_loc = conf.sample_selection_file
        self.extractor = Extractor()
        self.cache_enabled = cache_enabled
        if self.cache_enabled:
            self.extractor.enable_cache(cache_host, cache_port)
        self.look_back_years = look_back_years
        self.preprocessor = Preprocessor()
        # sample set placeholders
        self.crisis_samples = []
        self.normal_samples = []
        self.metadata = Metadata(conf, look_back_years)

#    def buil_per_conf(self):
#        self.build_from_crises_file_from_crises_file(True)
#        pass

    def interesting_years_before(self, target_year):
        return range(target_year - self.look_back_years, target_year)

    def assign_samples(self,
                       indicators,
                       event_years,
                       event_class,
                       country_code="?"):
        # creates machine learning samples from indicators
        # arguments:
        # event_years - years of crises or normal periods
        # (as specified in the sample selection file or in a rule)
        # event_class - desired class corresponding to these years
        samples = []
        # select only interesting values from the indicator
        for event_year in event_years:
            interesting_years = self.interesting_years_before(event_year)
            try:
                features = []
                for indicator in indicators:
                    new_features = self.preprocessor.preprocess_indicator(
                        indicator, interesting_years)
                    features.extend(new_features)
                sample_description = country_code.upper() + "-" + str(
                    event_year)
                sample = Sample(features,
                                event_class,
                                description=sample_description)
                samples.append(sample)
            except NonExistentDataError:
                pass
        return samples

    def convert_to_boundaries(self, event_years, look_back_years):
        """
        convert a list of event years and look back years into
        a list of 2-tuples of boundaries (begin_year, end_year)
        """
        boundaries = []
        for event_year in event_years:
            boundaries.append((event_year - look_back_years, event_year - 1))
        return boundaries

    def events_to_boundaries(self, all_events, look_back_years):
        event_boundaries = {}
        for key, value in all_events.items():
            event_boundaries[key] = self.convert_to_boundaries(
                value, look_back_years)
        return event_boundaries

    def divide_single(self, samples, test_percentage):
        # divide a list of samples into train and test subsets
        if test_percentage == 0:
            train_samples = samples
            test_samples = []
        else:
            number_test = int(len(samples) * test_percentage)
            test_samples = sample(samples, number_test)
            train_samples = list(set(samples).difference(set(test_samples)))
        return train_samples, test_samples

    def divide(self, crisis_samples, normal_samples, test_percentage):
        # same as divide_single, but for both crisis and normal samples, combining them
        # into single train and test lists
        self.train_samples, self.test_samples = self.divide_single(
            crisis_samples, test_percentage)
        new_train_samples, new_test_samples = self.divide_single(
            normal_samples, test_percentage)
        self.train_samples.extend(new_train_samples)
        self.test_samples.extend(new_test_samples)
        return self.train_samples, self.test_samples

    def build_from_crises_file(self, country_codes, feature_indicators,
                               test_percentage):
        """
        Entry method that builds a samples set by fetching the data using the extractor.
        Classes are determined from a crisis XLS file.
        
        sparse - if True it fetches the data for the necessary years only. Shown to be non-efficient.
        """
        # clear the sample sets
        self.crisis_samples = []
        self.normal_samples = []
        # get the years classified as crises / normal periods
        dates_input = Input()
        t_crises, t_normal = dates_input.parse_sample_selection(self.t_loc)
        crises_list, normal_list = dates_input.parse_sample_selection_to_list(
            self.t_loc)

        if country_codes[0] == "EVERYTHING":  # take everything available in the samples set
            wb_countries = self.extractor.grab_metadata("countries")
            wb_country_codes = set([country.code for country in wb_countries])
            samples_definition_codes = set(t_crises.keys()) | set(
                t_normal.keys())
            country_codes = list(wb_country_codes & samples_definition_codes)
            country_codes.sort()

        # we fetch all the data here
        # boundaries
        start_date = min(min(crises_list),
                         min(normal_list)) - conf.look_back_years
        end_date = max(max(crises_list), max(normal_list))
        arg = self.extractor.arg()
        arg["country_codes"] = country_codes
        arg["indicator_codes"] = feature_indicators
        arg["interval"] = (start_date, end_date)
        arg["pause"] = conf.wb_pause
        countries = self.extractor.grab(arg)
        if self.cache_enabled and self.extractor.was_cached():
            print("Cache was hit, didn't have to query the World Bank API.")
        elif self.cache_enabled:
            print("Data wasn't cached, queried the World Bank API.")

        # assign the samples
        for country in countries:
            # fetch all the indicators for target country
            indicators = []
            for ind_code in feature_indicators:
                indicator = country.get_indicator(ind_code)
                indicators.append(indicator)
            # create samples from those indicators - in crises...
            try:
                crisis_years = t_crises[country.code]
            except KeyError:
                continue  # we skip this country
            new_samples = self.assign_samples(indicators, crisis_years,
                                              CRISIS_CLASS, country.code)
            self.crisis_samples.extend(new_samples)
            # ... and in normal periods
            normal_years = t_normal[country.code]
            new_samples = self.assign_samples(indicators, normal_years,
                                              NORMAL_CLASS, country.code)
            self.normal_samples.extend(new_samples)
        return self.divide(self.crisis_samples, self.normal_samples,
                           test_percentage)

    def build_by_condition(self, country_codes, indicators, feature_indicators,
                           test_percentage):
        # determine crises according to some condition/rule
        raise NotImplementedError
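
A hedged sketch of how this class might be driven end to end (the indicator code and split are illustrative, not taken from the original configuration):

samples_set = SamplesSet(look_back_years=3, cache_enabled=True)
# "EVERYTHING" expands to every country present in both the World Bank
# metadata and the sample selection file (see build_from_crises_file above)
train_samples, test_samples = samples_set.build_from_crises_file(
    country_codes=["EVERYTHING"],
    feature_indicators=["SP.POP.TOTL"],  # illustrative feature indicator
    test_percentage=0.2)                 # hold out 20% of samples for testing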
Example #6
# <nbformat>3.0</nbformat>

# <markdowncell>

# Reading data
# ===========
# In this part we'll get the crisis years from a published IMF data set.
#
# First let's get the standard country codes corresponding to these countries using dracula.

# <codecell>

import inspect
from dracula.extractor import Extractor
import dracula
extractor = Extractor()
country_codes = {}
countries = extractor.grab_metadata("countries")
print(inspect.getsourcelines(dracula.wb.parser.parse_multiple_countries_alone))
for country in countries:
    #print(dir(country))
    country_codes[country.name] = country.code
print(country_codes)

# <markdowncell>

# Manual fixing

# <codecell>

country_codes["Serbia, Republic of"] = 'SRB'
Example #7
    def test_cache(self):
        #host = "localhost"
        host = "lis.irb.hr"
        extractor = Extractor()
        # enable cache, but use a test DB
        extractor.enable_cache(host, 27017, test=True)
        extractor.clear_cache()
        # grab some data
        arg = extractor.arg()
        arg["country_codes"] = ["hrv", "usa"]
        arg["interval"] = (1997, 1999)
        arg["indicator_codes"] = ["SP.POP.TOTL"]
        countries = extractor.grab(arg)
        # see if it's cached
        self.assertEqual(extractor.is_cached(arg), True,
                         "Countries must be cached after grab")
        arg["country_codes"].append("fin")
        self.assertEqual(extractor.is_cached(arg), False,
                         "Countries must match to give a cache hit")
        arg["country_codes"] = ["hrv", "usa"]
        arg["interval"] = (1996, 1999)
        self.assertEqual(extractor.is_cached(arg), False,
                         "Years must match to give a cache hit")
        arg["interval"] = (1997, 1999)
        arg["indicator_codes"].append("FR.INR.RINR")
        self.assertEqual(extractor.is_cached(arg), False,
                         "Indicators must match to give a cache hit")
        # grab some more data and see if there are duplicate countries
        countries = extractor.grab(arg)
        country_count = len([c for c in extractor._cacher.db.countries.find()])
        self.assertEqual(country_count, 2,
                         "Grabbing a wider set must not leave duplicates!")