def load(self, fh, **kwargs):
    r"""
    By default, this method raises Error if manager has not been configured yet.
    """
    if not self.isConfigured():
        raise Error('PKC Manager must be configured before proceeding with load!')
def findGroupByJobID(self, jobID):
    r"""
    Identify job group of the requested job ID.

    Parameters
    ----------
    jobID : string
        job ID

    Returns
    -------
    group_name : string
        name of the group that contains the requested job, if it exists

    Raises
    ------
    Error
        if jobID is found in more than one group
    """
    gr = self._jgmap.getBwdMap()[jobID]
    if len(gr) > 1:
        # should not happen with SetBDMap but to be safe...
        raise Error('Job shall be internally assigned to a single group! (got %s)' % gr)
    # return single element from the set
    return next(iter(gr))
def removeJob(self, jobID):
    r"""
    Remove requested job from this manager.

    Parameters
    ----------
    jobID : string
        job ID

    Raises
    ------
    Warn
        if the job is not yet executed and/or finished
    Error
        if the job has unrecognized status
    """
    jobStatus = self.getJobStatus(jobID)
    if jobStatus == JobStatus.FINISHED:
        del self.jobs[jobID]
    elif jobStatus == JobStatus.EXECUTING:
        raise Warn('The job is not yet finished!')
    elif jobStatus == JobStatus.ADDED:
        raise Warn('The job is not yet executed!')
    else:
        st = ','.join([quote(s) for s in JobStatus.statuses])
        raise Error('Unknown job status! (got %s) (shall be %s)' % (jobStatus, st))
def verifyDepModule(modname):
    r"""
    Verify that requested module is importable from within KDVS.

    Parameters
    ----------
    modname : string
        module name to be verified

    Returns
    -------
    module_instance : module
        instance of the verified module, as present in 'sys.modules'

    Raises
    ------
    Error
        if requested module is not importable from within KDVS
    """
    try:
        __import__(modname)
        return sys.modules[modname]
    except ImportError:
        fp = os.path.abspath(__file__)
        fn = os.path.split(fp)[1]
        raise Error("To use this functionality in '%s', '%s' is required! (called in '%s')" % (fn, modname, fp))
def GO_num2id(num):
    r"""
    Resolve numerical part of GO term ID into full GO term ID.

    Parameters
    ----------
    num : integer/string
        supposed numerical part of GO term ID

    Returns
    -------
    termID : string
        full GO term ID

    Raises
    ------
    Error
        if numerical part does not resolve to valid GO term ID
    """
    err = Error('Unrecognized numeric part of GO ID! (got %s)' % quote(str(num)))
    if isinstance(num, int):
        fmt = "%%0%dd" % _goid_num_length
        num = fmt % num
    elif isinstance(num, basestring):
        if _goid_num_patt.match(num) is None:
            raise err
    else:
        raise err
    return _goid_idprefix + num
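
# Usage sketch (hypothetical values; assumes the standard GO format of a
# 'GO:' prefix followed by a 7-digit zero-padded number, i.e.
# _goid_idprefix == 'GO:' and _goid_num_length == 7):
#
#   GO_num2id(8150)        # -> 'GO:0008150'
#   GO_num2id('0008150')   # -> 'GO:0008150'
#   GO_num2id('8150x')     # raises Error (numeric part does not match pattern)
#
# The padding itself relies on a dynamically built format string:
#   ("%%0%dd" % 7) == "%07d", and "%07d" % 8150 == "0008150"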
def getHandle(file_path, *args, **kwargs):
    r"""
    Open specified file and return its handle. Uses
    :meth:`~kdvs.core.provider.fileProvider` to transparently open and read
    compressed files; additional arguments are passed to file provider directly.

    Parameters
    ----------
    file_path : string
        filesystem path to specified file
    args : iterable
        positional arguments to pass to file provider
    kwargs : dict
        keyword arguments to pass to file provider

    Returns
    -------
    handle : file--like
        opened handle to the file

    Raises
    ------
    Error
        if the file could not be accessed
    """
    fp = os.path.abspath(file_path)
    if not os.path.exists(fp):
        raise Error('Could not access %s!' % fp)
    else:
        return fileProvider(fp, *args, **kwargs)
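
# Usage sketch (hypothetical paths; assumes fileProvider recognizes the
# compression from the file suffix, as the docstring above states):
#
#   fh = getHandle('/data/annotations.txt')      # plain text file
#   gz = getHandle('/data/annotations.txt.gz')   # read transparently decompressed
#   for line in gz:
#       ...
#   gz.close()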
def __setitem__(self, key, val, replace=True):
    r"""
    Add new key--value pair to partial single mapping, with possible replacement.

    Parameters
    ----------
    key : object
        key of new key--value pair to be added
    val : object
        value of new key--value pair to be added
    replace : boolean
        if True and key already exists, replace existing key--value pair with
        given key--value pair; if False, raise Error; True by default

    Raises
    ------
    Error
        if replacement was not requested and key--value pair is already present
    """
    if replace is False and key in self.map:
        raise Error('"%s" already present in map!' % key)
    else:
        self.map.__setitem__(key, val)
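
# Usage sketch (hypothetical keys/values; note that the extra 'replace'
# argument cannot be passed through the 'm[k] = v' syntax, so __setitem__
# must be called directly to disable replacement):
#
#   m['probe_1'] = 'GO:0008150'                              # add or silently replace
#   m.__setitem__('probe_1', 'GO:0006915', replace=False)    # raises Error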
def __init__(self, ref_parameters=(), **kwargs):
    r"""
    Parameters
    ----------
    ref_parameters : iterable
        reference parameters to be checked against during instantiation;
        empty tuple by default
    kwargs : dict
        actual parameters supplied during instantiation; they will be checked
        against reference ones

    Raises
    ------
    Error
        if supplied parameters do not match the reference list exactly, i.e.
        some reference parameters are missing or extra ones were supplied
    """
    self.parameters = None
    # symmetric difference reports any parameter that is missing from, or not
    # present on, the reference iterable
    missing_params = set(ref_parameters) ^ set(kwargs.keys())
    if len(missing_params) > 0:
        refps = ','.join([quote(rp) for rp in ref_parameters])
        mps = ','.join([quote(mp) for mp in missing_params])
        raise Error('Some parameters are not on reference list! (reflist=%s, missing/extra=%s)' % (refps, mps))
    else:
        self.parameters = dict(kwargs)
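
# Illustration (standalone, hypothetical names): the symmetric difference used
# above flags both reference parameters that were not supplied and supplied
# parameters that are not on the reference list.
_ref = ('alpha', 'beta')
_supplied = {'alpha': 1, 'gamma': 2}
_mismatch = set(_ref) ^ set(_supplied.keys())
# _mismatch == set(['beta', 'gamma']): 'beta' is missing, 'gamma' is extra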
def load(self, fh, root_tag=None):
    r"""
    Read GO release from OBO-XML file and build all data structures. XML
    parsing is done with :mod:`xml.etree.ElementTree`
    (:mod:`xml.etree.cElementTree` if possible).

    Parameters
    ----------
    fh : file--like
        opened file handle of the OBO-XML file that contains encoded GO
        release; file handle must come from any recognized KDVS file provider
    root_tag : string/None
        root XML tag of OBO-XML file that will be accepted; if None, default
        root tag (:data:`GO_OBOXML_ROOT_TAG`) will be used; None by default

    Raises
    ------
    Error
        if requested root tag has not been found
    Error
        if file handle comes from unrecognized file provider
    Error
        if parsing of OBO-XML is interrupted with an error (re--raised
        ElementTree exception)

    See Also
    --------
    kdvs.core.provider.fileProvider
    """
    super(GOManager, self).load(fh)
    # technical parsing data
    self._format = None
    self._release_date = None
    # resolve root xml tag
    if root_tag is None:
        self.root_tag = GO_OBOXML_ROOT_TAG
    else:
        self.root_tag = root_tag
    # resolve file type
    if not fh.__class__ in RECOGNIZED_FILE_PROVIDERS:
        raise Error('File provider instance expected! (got %s)' % (fh.__class__))
    # do parsing
    try:
        self._parse(fh)
    # except ParseError, e:
    except Exception, e:
        raise Error('Error during parsing! (Reason: %s)' % e)
def __init__(self, arbitrary_data_root=None, provider=None, rootdbid=None):
    r"""
    Parameters
    ----------
    arbitrary_data_root : string/None
        path to directory containing all database objects managed by this
        manager instance; also, all new database objects will be created here;
        if None, default path '~/.kdvs/' will be used
    provider : DBProvider/None
        concrete DBProvider instance that provides internal details about
        requested database system; if None, default provider for SQLite3 is used
    rootdbid : string/None
        custom ID for meta--database; if not specified, the default one will
        be used

    Raises
    ------
    Error
        if the data root directory could not be accessed

    See Also
    --------
    os.path.expanduser
    """
    # resolve provider
    if provider:
        self.provider = provider
    else:
        # initialize default provider
        self.provider = SQLite3DBProvider()
    # resolve absolute data root
    self.def_config_path_root = os.path.expanduser('~/.%s/' % (SYSTEM_NAME_LC))
    if arbitrary_data_root is None:
        self.abs_data_root = self.def_config_path_root
    else:
        self.abs_data_root = arbitrary_data_root
    self.abs_data_root = os.path.abspath(self.abs_data_root)
    # ---- at this point check if data root is available at all
    if not os.path.exists(self.abs_data_root):
        raise Error('Could not access data root %s!' % quote(self.abs_data_root))
    # ---- create cache of opened connections
    self.db = {}
    self.db_loc = {}
    # ---- resolve root key
    if rootdbid is None:
        self.rootdb_key = '%s.root.db' % SYSTEM_NAME_LC
    else:
        self.rootdb_key = rootdbid
    # ---- init or open root db (where all non-dynamic metadata are stored)
    rootdb_loc = os.path.join(self.abs_data_root, self.rootdb_key)
    if not os.path.exists(rootdb_loc):
        # the trick here is to let sqlite3 create db file then init it
        rootdb = self.provider.connect(rootdb_loc)
        self.__init_rootdb(rootdb)
    else:
        # the trick here is just to connect to existing db file
        rootdb = self.provider.connect(rootdb_loc)
    self.db[self.rootdb_key] = rootdb
    self.db_loc[self.rootdb_key] = rootdb_loc
    # ---- create default path for db objects
    self.db_location = self.abs_data_root
    # ---- initialize always opened in-memory db
    self.memdb = self.__open_db('memdb')
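
# Usage sketch (hypothetical; assumes the chosen data root directory already
# exists, since the constructor only checks for it and does not create it):
#
#   dbm = DBManager()                     # SQLite3 provider, default '~/.kdvs/' root
#   dbm2 = DBManager('/tmp/kdvs_dbs')     # explicit, pre-created data root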
def postClose(self, destPath, *args):
    r"""
    Used by subclasses. Currently used only in 'experiment' application. By
    default it checks if given destination path exists.
    """
    # overloaded by subclasses
    # by default it checks if given destination path exists
    if not os.path.exists(destPath):
        raise Error('Destination path "%s" does not exist!' % destPath)
def getAll(self, as_dict=False, dict_on_rows=True):
    r"""
    Returns all fetched results at once, wrapped in desired structure (list or
    dictionary). NOTE: depending on the query itself, it may consume a lot of
    memory.

    Parameters
    ----------
    as_dict : boolean
        specify if the results are to be wrapped in dictionary instead of a
        list; False by default
    dict_on_rows : boolean
        valid if previous argument is True; specify if dictionary should be
        keyed by content of the first column (effectively creating dictionary
        on rows), instead of column name

    Returns
    -------
    result : list/dict
        query results wrapped either in list or in dictionary; dictionary can
        be keyed by column name (dictionary on columns) or content of the
        first column (dictionary on rows); e.g. if database has columns

        * 'ID', 'A', 'B', 'C'

        and result consists of two tuples

        * ('ID_111', 1.0, 2.0, 3.0)
        * ('ID_222', 4.0, 5.0, 6.0),

        the 'dictionary on columns' will contain:

        * {'ID' : ['ID_111', 'ID_222'], 'A' : [1.0, 4.0], 'B' : [2.0, 5.0], 'C' : [3.0, 6.0]}

        and the 'dictionary on rows' will contain:

        * {'ID_111' : [1.0, 2.0, 3.0], 'ID_222' : [4.0, 5.0, 6.0]}

    Raises
    ------
    Error
        if whatever error prevented result row from being obtained; NOTE:
        essentially, it watches for raising of OperationalError specific for
        the database provider

    See Also
    --------
    Cursor.fetchall
    OperationalError
    """
    dberror = self.dbt.dbm.provider.getOperationalError()
    try:
        results = self.cs.fetchall()
    except dberror, e:
        raise Error('Cannot fetch results from cursor (desc: %s) for table %s in database %s! (Reason: %s)' % (self.cs.description, self.dbt.name, self.dbt.db_key, e))
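
# Illustration (standalone): building the two dictionary shapes described in
# the docstring above from the example rows, using only the column header and
# the raw result tuples.
_header = ('ID', 'A', 'B', 'C')
_rows = [('ID_111', 1.0, 2.0, 3.0), ('ID_222', 4.0, 5.0, 6.0)]
# dictionary on columns: one list of values per column name
_on_columns = dict((col, [r[i] for r in _rows]) for i, col in enumerate(_header))
# dictionary on rows: first column is the key, remaining columns form the value
_on_rows = dict((r[0], list(r[1:])) for r in _rows)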
def _verify_header(self, filehandle, header):
    cf = self.getCommentSkipper(filehandle)
    l = cf.next()
    l_spl = l.split(self.delimiter)
    if len(l_spl) != len(header):
        raise Error('Number of columns in header must match number of columns in file!')
    # rewind file -- checked first line
    filehandle.seek(0)
def loadAll(self, debug=False):
    r"""
    Fill the DSV table with data coming from associated DSV file. The input
    generator is the :data:`~kdvs.core.util.CommentSkipper` instance that is
    obtained automatically. This method handles all underlying low--level
    activities. NOTE: the associated DSV file remains open until closed
    manually with the :meth:`close` method.

    Parameters
    ----------
    debug : boolean
        provides debug mode for table filling; if True, collect all SQL
        statements produced by underlying RDBMS and return them as list of
        strings; if False, return None

    Returns
    -------
    statements : list of string/None
        RDBMS SQL statements issued during table filling, if debug mode is
        requested; or None otherwise

    Raises
    ------
    Error
        if underlying table has not yet been created
    Error
        if data could not be loaded for whatever reason; see DBTable.load for
        more details

    See Also
    --------
    kdvs.fw.DBTable.DBTable.load
    """
    if not self.isCreated():
        raise Error('Underlying table must be created first!')
    else:
        try:
            cf = self.getCommentSkipper(self.handle)
            csvf = csv.reader(cf, self.dialect)
            return super(DSV, self).load(content=csvf, debug=debug)
        except Exception, e:
            raise Error('Could not load file content! (Reason: %s)' % e)
def __setitem__(self, key, value):
    r"""
    Raises
    ------
    Error
        if element is not associated with this instance, i.e. is not one of
        standard elements, and was not present during initialization
    """
    if key not in self._elements:
        raise Error('Element %s not associated with this instance!' % quote(key))
    else:
        self._results[key] = value
def isEmpty(self):
    r"""
    Returns True if the table is empty, False otherwise.

    Raises
    ------
    Error
        if table has not yet been created
    """
    if not self.isCreated():
        raise Error('DataTable %s must be created in %s first!' % (quote(self.name), quote(self.db_key)))
    return self.countRows() == 0
def build(self, categorizers_list, categorizers_inst_dict, initial_symbols):
    r"""
    Build categories hierarchy and symboltree.

    Parameters
    ----------
    categorizers_list : iterable of string
        iterable of identifiers of :class:`~kdvs.fw.Categorizer.Categorizer`
        instances, starting from root of the tree
    categorizers_inst_dict : dict
        dictionary of :class:`~kdvs.fw.Categorizer.Categorizer` instances,
        identified by specified identifiers
    initial_symbols : iterable of string
        initial pool of symbols to be 'partitioned' with nested categorizers
        into symboltree; typically, contains all prior knowledge concepts
        (from single domain or all domains if necessary)

    Raises
    ------
    Error
        if requested :class:`~kdvs.fw.Categorizer.Categorizer` instance is not
        found in the instances dictionary
    """
    self._categorizersInst = categorizers_inst_dict
    try:
        self._categorizersList = [(cid, self._categorizersInst[cid]) for cid in categorizers_list]
    except KeyError:
        raise Error('Categorizer %s not found in instances dictionary! (got %s)' % (cid, self._categorizersInst))
    self._symbols = initial_symbols
    for (_, pcat), (chcatID, _) in pairwise(self._categorizersList):
        pcategories = pcat.categories()
        for pcategory in pcategories:
            self.hierarchy[pcat.uniquifyCategory(pcategory)] = chcatID
    try:
        # first categories are rooted with None
        firstcat = self._categorizersList[0][0]
        self.hierarchy[None] = firstcat
    except IndexError:
        pass
    try:
        # final categories are leafed with None
        finalcat = self._categorizersList[-1][1]
        finalcts = finalcat.categories()
        for finalct in finalcts:
            self.hierarchy[finalcat.uniquifyCategory(finalct)] = None
    except IndexError:
        pass
    try:
        self._buildSymbols(None, self._symbols)
    except NotImplementedError:
        self.symboltree = None
def __init__(self, IDstr, categorizeFuncTable):
    r"""
    Parameters
    ----------
    IDstr : string
        identifier for the categorizer; usage of only alphanumerical
        characters is preferred; descriptive identifiers are preferred due to
        heavy usage in KDVS logs
    categorizeFuncTable : dict(string->callable)
        function table for the categorizer; keys are category names that this
        categorizer will use; values are callables that return that exact
        category name or :data:`NOTCATEGORIZED`

    Raises
    ------
    Error
        if identifier is not a string
    Error
        if function table is not a dictionary or it is empty
    Error
        if any key in function table is not a string
    """
    if not isinstance(IDstr, basestring):
        raise Error('%s instance expected! (got %s)' % (basestring, IDstr.__class__))
    if not isinstance(categorizeFuncTable, dict) or len(categorizeFuncTable) < 1:
        raise Error('%s non-empty instance expected! (got %s)' % (dict, categorizeFuncTable.__class__))
    catkeysstr = all([isinstance(k, basestring) for k in categorizeFuncTable.keys()])
    if not catkeysstr:
        raise Error('Keys of categorizer function table must be strings!')
    self.id = IDstr
    # categorizeFuncTable: {category->categorize_function}
    self.categorizeC2F = dict(categorizeFuncTable)
    self.categorizeF2C = dict([(v, k) for k, v in self.categorizeC2F.iteritems()])
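
# Usage sketch (hypothetical function table; what each callable receives
# depends on what the categorizer is applied to, so 'obj.size' below is
# illustrative only; NOTCATEGORIZED is the sentinel for "does not belong here"):
#
#   def _small(obj):
#       return 'small' if obj.size <= 30 else NOTCATEGORIZED
#
#   def _large(obj):
#       return 'large' if obj.size > 30 else NOTCATEGORIZED
#
#   size_categorizer = Categorizer('SizeSplit', {'small': _small, 'large': _large})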
def __open_db(self, db_id='memdb'):
    if not isinstance(db_id, basestring):
        raise Error('Invalid database ID! (got %s)' % db_id.__class__)
    if db_id != 'memdb':
        # file-based db requested, form proper name
        db_path = os.path.abspath(os.path.join(self.db_location, '%s.db' % db_id))
        if not os.path.exists(db_path):
            _created = True
            _msg = 'create'
        else:
            _created = False
            _msg = 'open'
    else:
        # new in-memory db requested
        db_path = ':memory:'
        _created = True
        _msg = 'create'
    try:
        db = self.provider.connect(db_path)
        if _created is True:
            self.db[db_id] = db
            self.db_loc[db_id] = db_path
        if db_id != 'memdb':
            # record opened file-based database
            hname = socket.gethostname()
            if len(hname) == 0:
                hname = '<unknown>'
            db_path = hname + '://' + db_path
            if _created is True:
                cr_rec = '1'
            else:
                cr_rec = '0'
            rootdb = self.db[self.rootdb_key]
            c = rootdb.cursor()
            c.execute('insert into DB values (?, ?, ?)', (db_id, cr_rec, quote(db_path)))
            rootdb.commit()
            c.close()
        return db
    except Exception, e:
        raise Error('Cannot %s database %s in %s! (Reason: %s)' % (_msg, quote(db_id), db_path, e))
def _resolve_dialect(self, filehandle, sniff_line_count=10):
    peek_lines = list(itertools.islice(filehandle, sniff_line_count))
    cf = self.getCommentSkipper(peek_lines)
    cflines = ''.join([l for l in cf])
    buf = StringIO.StringIO(cflines).getvalue()
    try:
        self.dialect = csv.Sniffer().sniff(buf)
        self.delimiter = self.dialect.delimiter
        filehandle.seek(0)
    except csv.Error, e:
        raise Error('Could not determine dialect for file %s! (Reason: %s)' % (quote(filehandle.name), e))
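
# Illustration (standalone): csv.Sniffer deduces the delimiter from a small
# sample of the file, which is the stdlib mechanism relied on above.
import csv
_sample = "ID\tA\tB\nrow1\t1.0\t2.0\nrow2\t3.0\t4.0\n"
_dialect = csv.Sniffer().sniff(_sample)
# _dialect.delimiter == '\t'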
def _finalize_new_location(self, sublocs):
    locname = self.sublocation_separator.join(sublocs)
    locpathelem = os.path.sep.join(sublocs)
    locpath = os.path.join(self.abs_root_path, locpathelem)
    if self.getLocation(locname) is None:
        self.locations[locname] = locpath
        if not os.path.exists(locpath):
            try:
                os.makedirs(locpath)
            except os.error, e:
                raise Error('Could not create location %s under manager %s! (Reason: %s)' % (quote(locname), quote(self.name), e))
def _checkTableExists(self, conn, tablename):
    try:
        c = conn.cursor()
        c.execute('select name from sqlite_master where type="table" and name="%s"' % tablename)
        tname = c.fetchone()
        if tname:
            tname = tname[0]
        return str(tname) == tablename
    except self.getOperationalError(), e:
        raise Error('Cannot check existence of table %s! (Reason: %s)' % (quote(tablename), e))
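
# Illustration (standalone, plain sqlite3): the same existence check expressed
# with a parameterized query, which avoids interpolating the table name into
# the SQL text; shown here only as a sketch of the underlying mechanism.
import sqlite3
_conn = sqlite3.connect(':memory:')
_conn.execute('create table DEMO (x text)')
_row = _conn.execute(
    'select name from sqlite_master where type=? and name=?',
    ('table', 'DEMO')).fetchone()
# _row is not None, so the table exists
_conn.close()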
def __init__(self, main_dtable, pkcidmap_inst):
    r"""
    Parameters
    ----------
    main_dtable : :class:`~kdvs.fw.DBTable.DBTable`
        database table that holds primary non--partitioned input data set with
        all measurements; overlapping subsets will be created based on it
    pkcidmap_inst : :class:`~kdvs.fw.Map.PKCIDMap`
        concrete instance of fully constructed PKCIDMap that contains mapping
        between individual measurements and prior knowledge concepts;
        overlapping subsets will be created based on that mapping

    Raises
    ------
    Error
        if any argument is not an instance of the expected class
    """
    super(PKDrivenDBDataManager, self).__init__()
    if not isinstance(main_dtable, DBTable):
        raise Error('%s instance expected! (%s found)' % (DBTable, className(main_dtable)))
    if not isinstance(pkcidmap_inst, PKCIDMap):
        raise Error('%s instance expected! (%s found)' % (PKCIDMap, className(pkcidmap_inst)))
    self.pkcidmap = pkcidmap_inst
    self.dtable = main_dtable
    self.all_samples = self._get_all_samples()
def __init__(self, in_dict):
    r"""
    Parameters
    ----------
    in_dict : dict
        dictionary containing simplified directives; the constructor checks
        if all required elements are present

    Raises
    ------
    Error
        if list/tuple of column names is not specified or is empty
    Error
        if input dictionary is missing any of required elements
    """
    self._tmpl = dict()
    if all(k in in_dict for k in dbtemplate_keys):
        cls = in_dict['columns']
        if not isListOrTuple(cls) or len(cls) == 0:
            raise Error('Non-empty list or tuple expected! (got %s)' % cls.__class__)
        self._tmpl.update(in_dict)
    else:
        raise Error('%s must contain all of the following keys: %s !' % (quote(className(self)), quote(' '.join(dbtemplate_keys))))
def __init__(self, jobs_to_execute):
    r"""
    Parameters
    ----------
    jobs_to_execute : iterable of :class:`~kdvs.fw.Job.Job`
        jobs to be executed

    Raises
    ------
    Error
        if iterable is incorrectly specified
    """
    if not isListOrTuple(jobs_to_execute):
        raise Error('List or tuple expected! (got %s)' % jobs_to_execute.__class__)
    self.jobs_to_execute = jobs_to_execute
def order(self):
    r"""
    Returns the ordering built by this orderer.

    Returns
    -------
    ordering : iterable
        properly ordered iterable of data subset IDs

    Raises
    ------
    Error
        if ordering has not been built yet
    """
    if self.ordering is None:
        raise Error('Ordering has not been built!')
    else:
        return self.ordering
def load(self, content=emptyGenerator(), debug=False):
    r"""
    Fill the already created table with some data, coming from specified
    generator callable.

    Parameters
    ----------
    content : generator callable
        generator callable that furnishes the data; this method DOES NOT check
        the correctness of furnished data, this is left to the user; by
        default, empty generator callable is used
    debug : boolean
        provides debug mode for table filling; if True, collect all SQL
        statements produced by underlying RDBMS and return them as list of
        strings; if False, return None

    Returns
    -------
    statements : list of strings/None
        RDBMS SQL statements issued during table filling, if debug mode is
        requested; or None otherwise

    Raises
    ------
    Error
        if table filling was interrupted with an error; essentially, reraise
        OperationalError from underlying RDBMS
    """
    statements = []
    cs = self.db.cursor()
    dberror = self.dbm.provider.getOperationalError()
    # ---- load content
    for cont in content:
        if len(cont) > 0:
            ct = ','.join([quote(f) for f in cont])
            st = 'insert into %s values (%s)' % (quote(self.name), ct)
            if debug:
                statements.append(st)
            else:
                try:
                    cs.execute(st)
                except dberror, e:
                    raise Error('Cannot insert content %s into table %s in database %s! (Reason: %s)' % (quote(ct), quote(self.name), quote(self.db_key), e))
def create(self, indexed_columns='*', debug=False):
    r"""
    Physically create the table in underlying RDBMS; the creation is deferred
    until this call. The table is created empty.

    Parameters
    ----------
    indexed_columns : list/tuple/'*'
        list/tuple of column names to be indexed by underlying RDBMS; if
        string '*' is specified, all columns will be indexed; '*' by default
    debug : boolean
        provides debug mode for table creation; if True, collect all SQL
        statements produced by underlying RDBMS and return them as list of
        strings; if False, return None

    Returns
    -------
    statements : list of strings/None
        RDBMS SQL statements issued during table creation, if debug mode is
        requested; or None otherwise

    Raises
    ------
    Error
        if table creation or indexing was interrupted with an error;
        essentially, reraise OperationalError from underlying RDBMS
    """
    statements = []
    # ---- create table
    cs = self.db.cursor()
    dberror = self.dbm.provider.getOperationalError()
    ctype = self.dbm.provider.getTextColumnType()
    # make columns
    cols = ','.join(['%s %s' % (quote(c), ctype) for c in self.columns])
    # make statement
    st = 'create table %s (%s)' % (quote(self.name), cols)
    if debug:
        statements.append(st)
    else:
        try:
            cs.execute(st)
        except dberror, e:
            raise Error('Cannot create table %s in database %s! (Reason: %s)' % (quote(self.name), quote(self.db_key), e))
def deriveMap(key, maps):
    r"""
    Derive single final value for given single key, computed across all given
    partial single mappings.

    Parameters
    ----------
    key : object
        key for which the final value will be derived
    maps : iterable of :class:`ChainMap`
        all single partial mappings, wrapped in ChainMap instances, that will
        be used for deriving the final value, in given order

    Returns
    -------
    key, interms, value : object/NOTMAPPED, list of object, object/None
        the tuple with the following elements: lookup key, or NOTMAPPED if at
        any stage of derivation NOTMAPPED was encountered; all intermediate
        values encountered during derivation; final derived value or None if
        not found

    Raises
    ------
    Error
        if iterable of partial single maps is incorrectly specified
    """
    interm = []
    if not all(isinstance(m, ChainMap) for m in maps):
        raise Error('Iterable of %s expected! (got %s)' % (ChainMap, maps))
    worklist = []
    worklist.extend(maps)
    worklist.insert(0, key)
    while len(worklist) > 1:
        interm_key = worklist.pop(0)
        interm_map = worklist.pop(0)
        val_interm_key = interm_map[interm_key]
        if val_interm_key == NOTMAPPED:
            return (NOTMAPPED, interm, interm_map)
        interm.append(val_interm_key)
        worklist.insert(0, val_interm_key)
    return (worklist[0], interm, None)
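
# Usage sketch (hypothetical data; assumes a ChainMap can be built empty and
# populated with item assignment, and that looking up a missing key yields
# NOTMAPPED rather than raising):
#
#   probe2gene = ChainMap(); probe2gene['probe_1'] = 'TP53'
#   gene2go = ChainMap();    gene2go['TP53'] = 'GO:0006915'
#   deriveMap('probe_1', [probe2gene, gene2go])
#   # -> ('GO:0006915', ['TP53', 'GO:0006915'], None)
#   deriveMap('probe_X', [probe2gene, gene2go])
#   # -> (NOTMAPPED, [], probe2gene)   derivation stopped at the first map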
def _extract_header(self, filehandle, makeMissingID):
    cf = self.getCommentSkipper(filehandle)
    l1 = cf.next()
    l2 = cf.next()
    l1_spl = [s.strip() for s in l1.split(self.delimiter)]
    l2_spl = [s.strip() for s in l2.split(self.delimiter)]
    if len(l1_spl) == len(l2_spl):
        if makeMissingID:
            if len(l1_spl[0]) == 0:
                l1_spl.pop(0)
                l1_spl.insert(0, DSV_DEFAULT_ID_COLUMN)
        self.header = l1_spl
        # rewind exactly after header
        # first rewind file to the beginning
        filehandle.seek(0)
        # then consume header
        cf.next()
    else:
        raise Error('Could not determine header for file %s! (Wrong header line?)' % quote(filehandle.name))