class BaseClient(object):
    """
    Low level client API which provides baseline functionality, including
    methods for loading data from csv and json, loading metrique client
    cubes, config file loading and logging setup.

    Essentially, cubes are data made from a list of dicts.

    All objects are expected to contain a `_oid` key value property. This
    property should be unique per individual "object" defined.

    For example, if we are storing logs, we might consider each log line a
    separate "object" since those log lines should never change in the
    future and give each a unique `_oid`. Or if we are storing data about
    'meta objects' of some sort, say 'github repo issues' for example, we
    might have objects with _oids of
    `%(username)s_%(reponame)s_%(issuenumber)s`.

    Optionally, objects can contain the following additional
    meta-properties:
        * _start - datetime when the object state was set
        * _end - datetime when the object state changed to a new state

    Field names (object dict keys) must consist of alphanumeric and
    underscore characters only.

    Field names are partially normalized automatically:
        * non-alphanumeric characters are removed
        * spaces converted to underscores
        * letters are lowercased

    Property values are normalized to some extent automatically as well:
        * empty strings -> None

    Additionally, some common operation methods are provided for
    operations such as loading a HTTP uri and determining currently
    configured username.

    :cvar name: name of the cube
    :cvar defaults: cube default property container (cube specific meta-data)
    :cvar fields: cube fields definitions
    :cvar saveas: filename to use when saving cube data to disk locally
    :cvar config: local cube config object

    If cube is specified as a kwarg upon initialization, the specific cube
    class will be located and returned, assuming it's available in
    sys.path.

    If the cube fails to import, RuntimeError will be raised.

    Example usage::

        >>> import pyclient
        >>> c = pyclient(cube='git_commit')
        <type HTTPClient(...)>

        >>> z = pyclient()
        >>> z.get_cube(cube='git_commit')
        <type HTTPClient(...)>
    """
    name = None
    defaults = None
    fields = None
    saveas = ""
    _cache = None
    config = None

    def __new__(cls, *args, **kwargs):
        if "cube" in kwargs and kwargs["cube"]:
            cls = get_cube(cube=kwargs["cube"], init=False)
        return object.__new__(cls)

    def __init__(self, config_file=None, name=None, **kwargs):
        # don't assign to {} in class def, define here to avoid
        # multiple pyclient objects linking to a shared dict
        if self.defaults is None:
            self.defaults = {}
        if self.fields is None:
            self.fields = {}
        if self._cache is None:
            self._cache = {}
        if self.config is None:
            self.config = {}

        self._config_file = config_file or Config.default_config

        # all defaults are loaded, unless specified in
        # metrique_config.json
        self.load_config(**kwargs)

        # cube class defined name
        self._cube = type(self).name

        # set name if passed in, but don't overwrite default if not
        self.name = name or self.name

        self.config.logdir = os.path.expanduser(self.config.logdir)
        if not os.path.exists(self.config.logdir):
            os.makedirs(self.config.logdir)
        self.config.logfile = os.path.join(self.config.logdir,
                                           self.config.logfile)

        # keep logging local to the cube so multiple
        # cubes can independently log without interfering
        # with each other's logging.
        self.debug_setup()
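    # NOTE: a hypothetical usage sketch (the values shown are
    # illustrative, not part of this module's API surface): extra
    # kwargs passed at init are merged into the loaded config by
    # load_config(), so per-instance overrides don't require editing
    # the config file::
    #
    #   >>> c = pyclient(config_file='~/.metrique/http_api', debug=2)
    #   >>> c.config.debug
    #   2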
    ####################### data loading api ###################
    def load_files(self, path, filetype=None, **kwargs):
        """
        Load multiple files from various file types automatically.

        Supports glob paths, eg::

            path = 'data/*.csv'

        Filetypes are autodetected by common extension strings.

        Currently supports loading from:
            * csv (pd.read_csv)
            * json (pd.read_json)

        :param path: glob path of the file(s) to load
        :param filetype: override filetype autodetection
        :param kwargs: additional filetype loader method kwargs
        """
        # kwargs are for passing ftype load options (csv.delimiter, etc)
        # expect the use of globs; eg, file* might result in fileN (file1,
        # file2, file3), etc
        datasets = glob.glob(os.path.expanduser(path))
        filetype = filetype or path.split(".")[-1]
        objects = None
        if datasets:
            # build up a single dataframe by concatting
            # all globbed files together
            objects = pd.concat([self._load_file(ds, filetype, **kwargs)
                                 for ds in datasets]).T.to_dict().values()
        return objects

    def _load_file(self, path, filetype, **kwargs):
        if filetype in ["csv", "txt"]:
            return self._load_csv(path, **kwargs)
        elif filetype in ["json"]:
            return self._load_json(path, **kwargs)
        else:
            raise TypeError("Invalid filetype: %s" % filetype)

    def _load_csv(self, path, **kwargs):
        # load the file according to filetype
        return pd.read_csv(path, **kwargs)

    def _load_json(self, path, **kwargs):
        return pd.read_json(path, **kwargs)

    ####################### objects manipulation ###################
    def df(self, objects=None):
        """Return a pandas dataframe from objects"""
        if objects:
            return pd.DataFrame(objects)
        else:
            return pd.DataFrame()

    def flush(self):
        """run garbage collection"""
        k = gc.get_count()
        # be sure we garbage collect any old object refs
        result = gc.collect()
        logger.debug("Garbage Flush: %s flushed, %s remain" % (k, result))

    def normalize(self, objects):
        """Convert, validate, normalize and locally cache objects"""
        # convert from other forms to basic list of dicts
        if objects is None:
            objects = []
        elif isinstance(objects, pd.DataFrame):
            objects = objects.T.to_dict().values()
        elif isinstance(objects, tuple):
            objects = list(objects)
        else:
            assert isinstance(objects, list)
        if objects:
            if not all([type(o) is dict for o in objects]):
                raise TypeError("object values must be dict")
            if not all([o.get("_oid") is not None for o in objects]):
                raise ValueError("_oid must be defined for all objs")
            objects = self._normalize(objects)
        return objects

    def _normalize(self, objects):
        """
        give all these objects the same _start value (if they
        don't already have one), and more...
        """
        start = utcnow()
        for i, o in enumerate(objects):
            # normalize fields (alphanumeric characters only, lowercase)
            o = self._obj_fields(o)
            # convert empty strings to None (null)
            o = self._obj_nones(o)
            # add object meta data the metriqued requires be set per object
            o = self._obj_end(o)
            o = self._obj_start(o, start)
            objects[i] = o
        return objects

    def _normalize_fields(self, k):
        k = k.lower()
        k = space_re.sub("_", k)
        k = fields_re.sub("", k)
        k = unda_re.sub("_", k)
        return k
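    # Illustrative sketch of the key normalization above (the exact
    # output depends on the module-level space_re/fields_re/unda_re
    # regexes, which are assumed here to strip non-alphanumerics and
    # collapse underscore runs)::
    #
    #   >>> c._normalize_fields('Log Line #7')
    #   'log_line_7'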
    def _obj_fields(self, obj):
        """ periods and dollar signs are not allowed! """
        # replace spaces, lowercase keys, remove non-alphanumeric
        # WARNING: only lowers the top level though, at this time!
        return dict((self._normalize_fields(k), v)
                    for k, v in obj.iteritems())

    def _obj_nones(self, obj):
        return dict([(k, None) if v == "" else (k, v)
                     for k, v in obj.items()])

    def _obj_end(self, obj, default=None):
        obj["_end"] = obj.get("_end", default)
        return obj

    def _obj_start(self, obj, default=None):
        _start = obj.get("_start", default)
        obj["_start"] = _start or utcnow()
        return obj

    def oids(self, objects):
        """Return back a list of _oids for all locally cached objects"""
        return [o["_oid"] for o in objects]

    #################### misc ##################################
    def debug_setup(self):
        """
        Local object instance logger setup.

        Verbosity levels are determined as such::

            if level in [-1, False]:
                logger.setLevel(logging.WARN)
            elif level in [0, None]:
                logger.setLevel(logging.INFO)
            elif level in [True, 1, 2]:
                logger.setLevel(logging.DEBUG)

        If (level == 2) `logging.DEBUG` will be set even for
        the "root logger".

        Configuration options available for customized logger behavior:
            * debug (bool)
            * logstdout (bool)
            * log2file (bool)
            * logfile (path)
        """
        level = self.config.debug
        logstdout = self.config.logstdout
        logfile = self.config.logfile
        log_format = "%(name)s.%(process)s:%(asctime)s:%(message)s"
        log_format = logging.Formatter(log_format, "%Y%m%dT%H%M%S")

        logger = logging.getLogger()
        logger.handlers = []
        if logstdout:
            hdlr = logging.StreamHandler()
            hdlr.setFormatter(log_format)
            logger.addHandler(hdlr)
        if self.config.log2file and logfile:
            hdlr = logging.FileHandler(logfile)
            hdlr.setFormatter(log_format)
            logger.addHandler(hdlr)
        self._debug_set_level(logger, level)

    def _debug_set_level(self, logger, level):
        if level in [-1, False]:
            logger.setLevel(logging.WARN)
        elif level in [0, None]:
            logger.setLevel(logging.INFO)
        elif level in [True, 1, 2]:
            logger.setLevel(logging.DEBUG)
        return logger

    def get_cube(self, cube, init=True, name=None, **kwargs):
        """
        wrapper for :func:`metriqueu.utils.get_cube`

        Locates and loads a metrique cube

        :param cube: name of cube to load
        :param init: (bool) initialize cube before returning?
        :param name: override the name of the cube
        :param kwargs: additional kwargs to pass to
                       :func:`metriqueu.utils.get_cube`
        """
        config = copy(self.config)
        # don't apply the name to the current obj, but to the object
        # we get back from get_cube
        return get_cube(cube=cube, init=init, config=config,
                        name=name, **kwargs)
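    # Hypothetical sketch: get_cube() hands a *copy* of this instance's
    # config to the new cube, so the returned cube inherits logging and
    # connection settings without re-reading the config file, and
    # without the two objects sharing mutable config state::
    #
    #   >>> c = pyclient()
    #   >>> commits = c.get_cube('git_commit', name='my_commits')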
    def get_property(self, property, field=None, default=None):
        """
        Lookup cube defined property (meta-data):

            1. First, try to use the field's property, if defined.
            2. Then, try to use the cube's default property, if defined.
            3. Then, use the passed-in default when neither is found.
            4. Or return None, if no default is defined.

        :param property: property key name
        :param field: (optional) specific field to query first
        :param default: default value to return if [field.]property not found
        """
        try:
            return self.fields[field][property]
        except KeyError:
            try:
                return self.defaults[property]
            except (TypeError, KeyError):
                return default

    def load_config(self, config=None, **kwargs):
        """
        Try to load a config file and handle when it's not available

        :param config: config file path or :class:`metriqueu.jsonconf.JSONConf`
        :param kwargs: additional config key:value pairs to store
        """
        if isinstance(config, Config):
            # already a loaded config object; reuse its file path
            self._config_file = config.config_file
        else:
            self._config_file = config or self._config_file
        self.config = Config(config_file=self._config_file)
        self.config.update(kwargs)

    #################### Helper API ############################
    def urlretrieve(self, uri, saveas):
        """urllib.urlretrieve wrapper"""
        return urllib.urlretrieve(uri, saveas)

    def whoami(self, auth=False):
        """Local api call to check the username of running user"""
        return self.config["username"]
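# A hypothetical end-to-end sketch tying the pieces together (the cube
# name, glob path and property names below are assumed for illustration
# only)::
#
#   >>> c = pyclient(cube='git_commit')
#   >>> objects = c.load_files('~/data/commits*.csv')
#   >>> objects = c.normalize(objects)
#   >>> c.get_property('sort', default=-1)
#   -1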