예제 #1
0
    def load_config(self, config=None, **kwargs):
        """Load a config file, or adopt an already-built config object.

        :param config: config file path or :class:`metriqueu.jsonconf.JSONConf`
            (Config) instance to reuse as-is
        :param kwargs: additional config key:value pairs to store
        """
        # BUG FIX: the old test `type(config) is type(Config)` compared
        # against the metaclass (`type`), so it was never true for a Config
        # *instance*; use isinstance so a pre-built config is detected and
        # actually adopted (the old branch also forgot to store it).
        if isinstance(config, Config):
            self._config_file = config.config_file
            self.config = config
        else:
            self._config_file = config or self._config_file
            self.config = Config(config_file=self._config_file)
        self.config.update(kwargs)
예제 #2
0
class BaseClient(object):
    """
    Low level client API which provides baseline functionality, including
    methods for loading data from csv and json, loading metrique client
    cubes, config file loading and logging setup.

    Essentially, cubes are data made from a list of dicts.

    All objects are expected to contain a `_oid` key value property. This
    property should be unique per individual "object" defined.

    For example, if we are storing logs, we might consider each log line a
    separate "object" since those log lines should never change in the future
    and give each a unique `_oid`. Or if we are storing data about
    'meta objects' of some sort, say 'github repo issues' for example, we
    might have objects with _oids of
    `%(username)s_%(reponame)s_%(issuenumber)s`.

    Optionally, objects can contain the following additional meta-properties:
        * _start - datetime when the object state was set
        * _end - datetime when the object state changed to a new state

    Field names (object dict keys) must consist of alphanumeric and underscore
    characters only.

    Field names are partially normalized automatically:
        * non-alphanumeric characters are removed
        * spaces converted to underscores
        * letters are lowercased

    Property values are normalized to some extent automatically as well:
        * empty strings -> None

    Additionally, some common operation methods are provided for
    operations such as loading a HTTP uri and determining currently
    configured username.

    :cvar name: name of the cube
    :cvar defaults: cube default property container (cube specific meta-data)
    :cvar fields: cube fields definitions
    :cvar saveas: filename to use when saving cube data to disk locally
    :cvar config: local cube config object

    If cube is specified as a kwarg upon initialization, the specific cube
    class will be located and returned, assuming its available in sys.path.

    If the cube fails to import, RuntimeError will be raised.

    Example usage::

        >>> import pyclient
        >>> c = pyclient(cube='git_commit')
            <type HTTPClient(...)>

        >>> z = pyclient()
        >>> z.get_cube(cube='git_commit')
            <type HTTPClient(...)>

    """

    # Class-level defaults (documented on the class docstring as cvars).
    # Mutable containers are deliberately None here and (re)built per
    # instance in __init__ to avoid cross-instance sharing of one dict.
    name = None      # name of the cube
    defaults = None  # cube default property container (cube specific meta-data)
    fields = None    # cube fields definitions
    saveas = ""      # filename to use when saving cube data to disk locally
    _cache = None    # per-instance cache dict (usage not visible in this chunk)
    config = None    # local cube config object (replaced by Config in load_config)

    def __new__(cls, *args, **kwargs):
        if "cube" in kwargs and kwargs["cube"]:
            cls = get_cube(cube=kwargs["cube"], init=False)
        else:
            cls = cls
        return object.__new__(cls)

    def __init__(self, config_file=None, name=None, **kwargs):
        """Set up per-instance containers, load config and configure logging.

        :param config_file: path of the config file to load
            (defaults to Config.default_config)
        :param name: cube name override; class default is kept if None
        :param kwargs: additional config key:value pairs, passed through
            to load_config
        """
        # don't assign to {} in class def, define here to avoid
        # multiple pyclient objects linking to a shared dict
        if self.defaults is None:
            self.defaults = {}
        if self.fields is None:
            self.fields = {}
        if self._cache is None:
            self._cache = {}
        if self.config is None:
            self.config = {}

        self._config_file = config_file or Config.default_config

        # all defaults are loaded, unless specified in
        # metrique_config.json
        self.load_config(**kwargs)

        # cube class defined name
        self._cube = type(self).name

        # set name if passed in, but don't overwrite default if not
        self.name = name or self.name

        # NOTE(review): by this point load_config should have replaced
        # self.config with a Config object (attribute access below); the
        # plain dict assigned above would raise AttributeError here
        self.config.logdir = os.path.expanduser(self.config.logdir)
        if not os.path.exists(self.config.logdir):
            os.makedirs(self.config.logdir)
        self.config.logfile = os.path.join(self.config.logdir, self.config.logfile)

        # keep logging local to the cube so multiple
        # cubes can independently log without interfering
        # with each other's logging.
        self.debug_setup()

    ####################### data loading api ###################
    def load_files(self, path, filetype=None, **kwargs):
        """Load multiple files from various file types automatically.

        Supports glob paths, eg::

            path = 'data/*.csv'

        Filetypes are autodetected by common extension strings.

        Currently supports loadings from:
            * csv (pd.read_csv)
            * json (pd.read_json)

        :param path: path to config json file
        :param filetype: override filetype autodetection
        :param kwargs: additional filetype loader method kwargs
        """
        # kwargs are for passing ftype load options (csv.delimiter, etc)
        # expect the use of globs; eg, file* might result in fileN (file1,
        # file2, file3), etc
        datasets = glob.glob(os.path.expanduser(path))
        objects = None
        for ds in datasets:
            filetype = path.split(".")[-1]
            # buid up a single dataframe by concatting
            # all globbed files together
            objects = pd.concat([self._load_file(ds, filetype, **kwargs) for ds in datasets]).T.as_dict().values()
        return objects

    def _load_file(self, path, filetype, **kwargs):
        if filetype in ["csv", "txt"]:
            return self._load_csv(path, **kwargs)
        elif filetype in ["json"]:
            return self._load_json(path, **kwargs)
        else:
            raise TypeError("Invalid filetype: %s" % filetype)

    def _load_csv(self, path, **kwargs):
        # load the file according to filetype
        return pd.read_csv(path, **kwargs)

    def _load_json(self, path, **kwargs):
        return pd.read_json(path, **kwargs)

    ####################### objects manipulation ###################
    def df(self, objects=None):
        """Return a pandas dataframe from objects"""
        if objects:
            return pd.DataFrame(objects)
        else:
            return pd.DataFrame()

    def flush(self):
        """run garbage collection"""
        counts = gc.get_count()
        collected = gc.collect()  # be sure we garbage collect any old object refs
        logger.debug("Garbage Flush: %s flushed, %s remain" % (counts, collected))

    def normalize(self, objects):
        """Convert, validate, normalize and locally cache objects"""
        # convert from other forms to basic list of dicts
        if objects is None:
            objects = []
        elif isinstance(objects, pd.DataFrame):
            objects = objects.T.to_dict().values()
        elif isinstance(objects, tuple):
            objects = list(objects)
        else:
            assert isinstance(objects, list)
        if objects:
            if not all([type(o) is dict for o in objects]):
                raise TypeError("object values must be dict")
            if not all([o.get("_oid") is not None for o in objects]):
                raise ValueError("_oid must be defined for all objs")
            objects = self._normalize(objects)
        return objects

    def _normalize(self, objects):
        """
        give all these objects the same _start value (if they
        don't already have one), and more...

        Each object has its field names normalized, empty strings
        nulled, and `_end`/`_start` meta-data stamped, in place.
        """
        stamp = utcnow()
        for idx in range(len(objects)):
            # normalize field names, then null empty strings
            cleaned = self._obj_nones(self._obj_fields(objects[idx]))
            # stamp the meta-data the metriqued requires per object,
            # replacing the entry in place
            cleaned = self._obj_end(cleaned)
            objects[idx] = self._obj_start(cleaned, stamp)
        return objects

    def _normalize_fields(self, k):
        """Normalize one field name: lowercase, spaces to underscores,
        strip disallowed characters, collapse underscore runs."""
        normalized = space_re.sub("_", k.lower())
        normalized = fields_re.sub("", normalized)
        return unda_re.sub("_", normalized)

    def _obj_fields(self, obj):
        """ periods and dollar signs are not allowed! """
        # replace spaces, lowercase keys, remove non-alphanumeric
        # WARNING: only lowers the top level though, at this time!
        # .items() instead of py2-only .iteritems(): dict() consumes
        # either form identically, and this keeps the method working on
        # python 3 as well
        return dict((self._normalize_fields(k), v) for k, v in obj.items())

    def _obj_nones(self, obj):
        return dict([(k, None) if v == "" else (k, v) for k, v in obj.items()])

    def _obj_end(self, obj, default=None):
        obj["_end"] = obj.get("_end", default)
        return obj

    def _obj_start(self, obj, default=None):
        _start = obj.get("_start", default)
        obj["_start"] = _start or utcnow()
        return obj

    def oids(self, objects):
        """Return back a list of _oids for all locally cached objects"""
        return [o["_oid"] for o in objects]

    #################### misc ##################################
    def debug_setup(self):
        """
        Local object instance logger setup.

        Verbosity levels are determined as such::

            if level in [-1, False]:
                logger.setLevel(logging.WARN)
            elif level in [0, None]:
                logger.setLevel(logging.INFO)
            elif level in [True, 1, 2]:
                logger.setLevel(logging.DEBUG)

        If (level == 2) `logging.DEBUG` will be set even for
        the "root logger".

        Configuration options available for customized logger behaivor:
            * debug (bool)
            * logstdout (bool)
            * log2file (bool)
            * logfile (path)
        """
        level = self.config.debug
        logstdout = self.config.logstdout
        logfile = self.config.logfile
        log_format = "%(name)s.%(process)s:%(asctime)s:%(message)s"
        log_format = logging.Formatter(log_format, "%Y%m%dT%H%M%S")

        logger = logging.getLogger()
        logger.handlers = []
        if logstdout:
            hdlr = logging.StreamHandler()
            hdlr.setFormatter(log_format)
            logger.addHandler(hdlr)
        if self.config.log2file and logfile:
            hdlr = logging.FileHandler(logfile)
            hdlr.setFormatter(log_format)
            logger.addHandler(hdlr)
        self._debug_set_level(logger, level)

    def _debug_set_level(self, logger, level):
        if level in [-1, False]:
            logger.setLevel(logging.WARN)
        elif level in [0, None]:
            logger.setLevel(logging.INFO)
        elif level in [True, 1, 2]:
            logger.setLevel(logging.DEBUG)
        return logger

    def get_cube(self, cube, init=True, name=None, **kwargs):
        """wrapper for :func:`metriqueu.utils.get_cube`

        Locates and loads a metrique cube

        :param cube: name of cube to load
        :param init: (bool) initialize cube before returning?
        :param name: override the name of the cube
        :param kwargs: additional :func:`metriqueu.utils.get_cube` kwargs
        """
        # copy so the name override applies only to the cube we return,
        # never to this instance's own config
        cube_config = copy(self.config)
        return get_cube(cube=cube, init=init, config=cube_config,
                        name=name, **kwargs)

    def get_property(self, property, field=None, default=None):
        """Lookup cube defined property (meta-data):

            1. First try to use the field's property, if defined.
            2. Then try to use the default property, if defined.
            3. Then use the default for when neither is found.
            4. Or return None, if no default is defined.

        :param property: property key name
        :param field: (optional) specific field to query first
        :param default: default value to return if [field.]property not found
        """
        try:
            return self.fields[field][property]
        except KeyError:
            try:
                return self.defaults[property]
            except (TypeError, KeyError):
                return default

    def load_config(self, config=None, **kwargs):
        """Load a config file, or adopt an already-built config object.

        :param config: config file path or :class:`metriqueu.jsonconf.JSONConf`
            (Config) instance to reuse as-is
        :param kwargs: additional config key:value pairs to store
        """
        # BUG FIX: the old test `type(config) is type(Config)` compared
        # against the metaclass (`type`), so it was never true for a Config
        # *instance*; use isinstance so a pre-built config is detected and
        # actually adopted (the old branch also forgot to store it).
        if isinstance(config, Config):
            self._config_file = config.config_file
            self.config = config
        else:
            self._config_file = config or self._config_file
            self.config = Config(config_file=self._config_file)
        self.config.update(kwargs)

    #################### Helper API ############################
    def urlretrieve(self, uri, saveas):
        """urllib.urlretrieve wrapper

        :param uri: uri to fetch
        :param saveas: local path to store the downloaded content at
        """
        downloaded = urllib.urlretrieve(uri, saveas)
        return downloaded

    def whoami(self, auth=False):
        """Local api call to check the username of running user"""
        return self.config["username"]