def countRows(self):
    r"""
    Count the number of rows in the table using the SQL standard 'count'
    function of the underlying RDBMS. The table must be filled to obtain a
    count > 0.

    Returns
    -------
    count : integer/None
        row count for the table, or None if underlying 'count' returns null

    Raises
    ------
    Error
        if the table is not yet created, or if counting was interrupted with
        an error; essentially, reraises OperationalError from the underlying
        RDBMS
    """
    if not self.isCreated():
        raise Error('DataTable %s in %s must be first created!' % (quote(self.name), quote(self.db_key)))
    cs = self.db.cursor()
    # count(ID column) counts non-null values of the designated ID column
    # NOTE(review): table name is interpolated unquoted here, unlike other
    # statements in this class -- works only for identifier-safe names
    cnt_st = 'select count(%s) from %s' % (quote(self.id_column), self.name)
    # OperationalError class is provider-specific; obtain it dynamically
    dberror = self.dbm.provider.getOperationalError()
    try:
        cs.execute(cnt_st)
    except dberror, e:
        raise Error('Cannot count from table %s in database %s! (Reason: %s)' % (quote(self.name), quote(self.db_key), e))
def getIDs(self):
    r"""
    Get content of the designated ID column and return it as a list of values.

    Returns
    -------
    IDs : list of strings
        values from the ID column, as queried by the underlying RDBMS; the
        list is sorted in insert order

    Raises
    ------
    Error
        if the table has not yet been created, or if querying of the ID
        column was interrupted with an error; essentially, reraises
        OperationalError from the underlying RDBMS
    """
    if not self.isCreated():
        raise Error('DataTable %s in %s must be first created!' % (quote(self.name), quote(self.db_key)))
    cs = self.db.cursor()
    # NOTE(review): table name is interpolated unquoted here -- works only
    # for identifier-safe names
    st = 'select %s from %s' % (quote(self.id_column), self.name)
    # OperationalError class is provider-specific; obtain it dynamically
    dberror = self.dbm.provider.getOperationalError()
    try:
        cs.execute(st)
    except dberror, e:
        raise Error('Cannot get IDs from table %s in database %s! (Reason: %s)' % (quote(self.name), quote(self.db_key), e))
def correctHGNCApprovedSymbols(hgnc_dsv): r""" Manually correct HGNC data after loading into DB and wrapping into :class:`~kdvs.fw.DSV.DSV` instance. It removes suffix (defined in :data:`HGNC_WITHDRAWN_GS_PART`) from withdrawn symbols in order to fasten further querying. This call must be used in the application that uses HGNC data, and must be made after HGNC data has been loaded and wrapped into DSV instance. Parameters ---------- hgnc_dsv : :class:`~kdvs.fw.DSV.DSV` valid instance of DSV that contains HGNC data """ # Approved Symbol column in HGNC DSV data contains symbols with textual # suffix when symbol is withdrawn; to unify querying, we need to get # rid of the suffix # NOTE: in this exceptional case we use low level access to database # table in order to perform specific modification db = hgnc_dsv.db c = db.cursor() # c.execute('update or ignore %s ' # 'set "Approved Symbol"=rtrim("Approved Symbol","~withdrawn") ' # 'where "Approved Symbol" like "%~withdrawn"'%(hgnc_table_name)) st = 'update or ignore %s ' \ 'set %s=rtrim(%s,%s) ' \ 'where %s like %s' % ( hgnc_dsv.name, quote(HGNC_APPROVED_SYMBOL_COL), quote(HGNC_APPROVED_SYMBOL_COL), quote(HGNC_WITHDRAWN_GS_PART), quote(HGNC_APPROVED_SYMBOL_COL), quote('%' + HGNC_WITHDRAWN_GS_PART)) c.execute(st) db.commit() c.close()
def __init__(self, name=None, root_path=None, create_dbm=False): r""" Parameters ---------- name : string/None name of the current instance; it will be used to identify all managed locations; if None, the name is generated randomly (UUID4) root_path : string/None directory path that refers to the root of locations that will be managed by this instance; if None, default root path will be used ('~/.kdvs/') create_dbm : boolean if True, default :class:`~kdvs.core.db.DBManager` instance will be created as well, rooted on specified root path; False by default See Also -------- uuid os.path.expanduser """ # ---- resolve instance name if name is None: self.name = uuid.uuid4().hex else: self.name = name # ---- resolve root_path self.def_root_path = os.path.expanduser('~/.%s/' % (SYSTEM_NAME_LC)) if root_path is None: self.root_path = self.def_root_path else: self.root_path = root_path self.abs_root_path = os.path.abspath(self.root_path) # ---- check if root path is available and writable if not os.path.exists(self.abs_root_path): raise Error('Could not access root path %s for manager %s!' % (quote(self.abs_root_path), quote(self.name))) if not self._check_path_writable(self.abs_root_path): raise Error('Could not write to root path %s of manager %s!' % (quote(self.abs_root_path), quote(self.name))) # ---- setup locations management self.locations = {} self.sublocation_separator = SUBLOCATION_SEPARATOR # add ROOT location self.root_location_id = 'ROOT_%s' % self.name self.locations[self.root_location_id] = self.abs_root_path # ---- resolve DBM if create_dbm: # create default DBManager with file-based SQLite3 provider self.dbm = DBManager(self.root_path) else: self.dbm = None
def _finalize_new_location(self, sublocs): locname = self.sublocation_separator.join(sublocs) locpathelem = os.path.sep.join(sublocs) locpath = os.path.join(self.abs_root_path, locpathelem) if self.getLocation(locname) is None: self.locations[locname] = locpath if not os.path.exists(locpath): try: os.makedirs(locpath) except os.error, e: raise Error( 'Could not create location %s under manager %s! (Reason: %s)' % (quote(self.name), quote(locname), e))
def _delete_location_from_sublocs(self, sublocs): locname = self.sublocation_separator.join(sublocs) locpath = self.getLocation(locname) try: del self.locations[locname] except KeyError: raise Error('Location %s not found under manager %s!' % (quote(locname), quote(self.name))) try: shutil.rmtree(locpath) except (OSError, os.error) as e: raise Error( 'Could not remove location %s under manager %s! (Reason: %s)' % (quote(locname), quote(self.name), e))
def GO_num2id(num):
    r"""
    Resolve the numerical part of a GO term ID into a full GO term ID.

    Parameters
    ----------
    num : integer/string
        supposed numerical part of a GO term ID

    Returns
    -------
    termID : string
        full GO term ID

    Raises
    ------
    Error
        if the numerical part does not resolve to a valid GO term ID
    """
    bad_input = Error('Unrecognized numeric part of GO ID! (got %s)' % quote(str(num)))
    if isinstance(num, int):
        # zero-pad the integer to the canonical GO numeric width
        num = ("%%0%dd" % _goid_num_length) % num
    elif isinstance(num, basestring):
        # string form must already match the canonical numeric pattern
        if _goid_num_patt.match(num) is None:
            raise bad_input
    else:
        raise bad_input
    return _goid_idprefix + num
def removeJob(self, jobID):
    r"""
    Remove requested job from this manager. Only finished jobs can be
    removed.

    Parameters
    ----------
    jobID : string
        job ID

    Raises
    ------
    Warn
        if the job is not yet executed and/or finished
    Error
        if the job has unrecognized status
    """
    jobStatus = self.getJobStatus(jobID)
    if jobStatus == JobStatus.FINISHED:
        del self.jobs[jobID]
    elif jobStatus == JobStatus.EXECUTING:
        raise Warn('The job is not yet finished!')
    elif jobStatus == JobStatus.ADDED:
        raise Warn('The job is not yet executed!')
    else:
        # unknown status: report it together with the list of valid statuses
        st = ','.join([quote(s) for s in JobStatus.statuses])
        raise Error('Unknown job status! (got %s) (shall be %s)' % (jobStatus, st))
def generateHGNCSynonyms(hgnc_dsv, map_db_key):
    r"""
    Create a helper table that eases resolving of synonymic gene symbols with
    HGNC data. The helper table may be created in a different subordinated
    database than the original HGNC data. The table is specified via template.

    Parameters
    ----------
    hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
        valid instance of DSV that contains HGNC data
    map_db_key : string
        ID of the database that will hold the helper table

    Returns
    -------
    synonymsDT : :class:`~kdvs.fw.DBTable.DBTable`
        DBTable wrapper for the newly created helper table

    See Also
    --------
    kdvs.fw.DBTable.DBTemplate
    """
    synonyms_dt = DBTable.fromTemplate(hgnc_dsv.dbm, map_db_key, HGNCSYNONYMS_TMPL)
    synonyms_dt.create(indexed_columns=HGNCSYNONYMS_TMPL['indexes'])
    # query only rows with a non-empty synonyms column
    syn_columns = (HGNC_APPROVED_SYMBOL_COL, HGNC_SYNONYMS_COL)
    syn_filter = "%s not like %s" % (quote(HGNC_SYNONYMS_COL), quote(HGNC_FIELD_EMPTY))
    hgnc_cs = hgnc_dsv.get(columns=syn_columns, filter_clause=syn_filter)

    def _gen():
        # yield (synonym, approved symbol) pairs for loading the helper table
        for syn_row in hgnc_cs:
            approved, rawsyns = [str(r) for r in syn_row]
            if len(rawsyns) > 0:
                # parse synonyms list from its raw separated form
                syns = [s.strip() for s in rawsyns.split(HGNC_FIELDS_SEP)]
            else:
                # no synonyms, make the approved symbol its own synonym
                syns = [approved]
            for syn in syns:
                yield (syn, approved)
    synonyms_dt.load(_gen())
    hgnc_cs.close()
    return synonyms_dt
def create(self, indexed_columns='*', debug=False):
    r"""
    Physically create the table in the underlying RDBMS; the creation is
    deferred until this call. The table is created empty.

    Parameters
    ----------
    indexed_columns : list/tuple/'*'
        list/tuple of column names to be indexed by the underlying RDBMS; if
        string '*' is specified, all columns will be indexed; '*' by default
    debug : boolean
        provides debug mode for table creation; if True, collect all SQL
        statements produced by the underlying RDBMS and return them as list
        of strings; if False, return None

    Returns
    -------
    statements : list of strings/None
        RDBMS SQL statements issued during table creation, if debug mode is
        requested; or None otherwise

    Raises
    ------
    Error
        if table creation or indexing was interrupted with an error;
        essentially, reraises OperationalError from the underlying RDBMS
    """
    statements = []
    # ---- create table
    cs = self.db.cursor()
    # OperationalError class and column type are provider-specific
    dberror = self.dbm.provider.getOperationalError()
    ctype = self.dbm.provider.getTextColumnType()
    # make columns: all columns share the provider's text column type
    cols = ','.join(['%s %s' % (quote(c), ctype) for c in self.columns])
    # make statement
    st = 'create table %s (%s)' % (quote(self.name), cols)
    if debug:
        # debug mode: record the statement instead of executing it
        statements.append(st)
    else:
        try:
            cs.execute(st)
        except dberror, e:
            raise Error('Cannot create table %s in database %s! (Reason: %s)' % (quote(self.name), quote(self.db_key), e))
def load(self, content=emptyGenerator(), debug=False):
    r"""
    Fill the already created table with data coming from the specified
    generator.

    Parameters
    ----------
    content : generator
        generator that furnishes the data rows; this method DOES NOT check
        the correctness of the furnished data, this is left to the user; by
        default, an empty generator is used
        NOTE(review): the default is evaluated once at definition time, so
        the shared default generator is exhausted after first use -- harmless
        only because an exhausted empty generator still yields nothing
    debug : boolean
        provides debug mode for table filling; if True, collect all SQL
        statements produced by the underlying RDBMS and return them as list
        of strings; if False, return None

    Returns
    -------
    statements : list of strings/None
        RDBMS SQL statements issued during table filling, if debug mode is
        requested; or None otherwise

    Raises
    ------
    Error
        if table filling was interrupted with an error; essentially,
        reraises OperationalError from the underlying RDBMS
    """
    statements = []
    cs = self.db.cursor()
    dberror = self.dbm.provider.getOperationalError()
    # ---- load content
    for cont in content:
        if len(cont) > 0:
            # NOTE(review): values are interpolated into the statement via
            # quote() rather than bound as parameters; safety relies entirely
            # on quote() escaping -- confirm for untrusted input
            ct = ','.join([quote(f) for f in cont])
            st = 'insert into %s values (%s)' % (quote(self.name), ct)
            if debug:
                # debug mode: record the statement instead of executing it
                statements.append(st)
            else:
                try:
                    cs.execute(st)
                except dberror, e:
                    raise Error('Cannot insert content %s into table %s in database %s! (Reason: %s)' % (quote(ct), quote(self.name), quote(self.db_key), e))
def isEmpty(self):
    r"""
    Return True if the table holds no rows, False otherwise.

    Raises
    ------
    Error
        if the table has not yet been created
    """
    # guard clause: an uncreated table cannot be counted
    if self.isCreated():
        return self.countRows() == 0
    raise Error('DataTable %s must be created in %s first!' % (quote(self.name), quote(self.db_key)))
def _resolve_dialect(self, filehandle, sniff_line_count=10): peek_lines = list(itertools.islice(filehandle, sniff_line_count)) cf = self.getCommentSkipper(peek_lines) cflines = ''.join([l for l in cf]) buf = StringIO.StringIO(cflines).getvalue() try: self.dialect = csv.Sniffer().sniff(buf) self.delimiter = self.dialect.delimiter filehandle.seek(0) except csv.Error, e: raise Error( 'Could not determine dialect for file %s! (Reason: %s)' % (quote(filehandle.name), e))
def __str__(self):
    r"""
    Return a readable one-line representation of the job: the wrapped
    callable, an abbreviated argument listing, the status, and whether a
    result has been produced.
    """
    # the result itself is never dumped, only produced/not-produced marker
    if self.result != NOTPRODUCED:
        res = "PRODUCED"
    else:
        res = str(NOTPRODUCED)
    try:
        cfunc = quote(self.call_func.__name__)
    except Exception:
        # IMPROVED: narrowed from a bare 'except' (which also swallowed
        # KeyboardInterrupt/SystemExit); callables without __name__
        # (e.g. functools.partial) fall back to a generic label
        cfunc = "<Function>"
    if len(self.call_args) <= DEFAULT_CALLARGS_LISTING_THR:
        cargs = 'args: %s' % str(self.call_args)
    else:
        # too many arguments to list verbosely; show only the threshold
        cargs = "args: >%s" % DEFAULT_CALLARGS_LISTING_THR
    jobstr = "<Job (%s with %s) (Status: %s, Result: %s)>" % (
        cfunc, cargs, self.status, res)
    return jobstr
def test_loadall4(self):
    # Build a DSV over the annotations fixture, create and bulk-load it,
    # then verify the loaded content with low-level SQL queries.
    dsv2_fh = DSV.getHandle(self.anno_dsv_path)
    dsv2 = DSV(self.dbm, self.testdb, dsv2_fh, dtname=self.test_dtname, comment=self.anno_comment)
    self.assertSequenceEqual(self.anno_header, dsv2.header)
    dsv2.create()
    dsv2.loadAll()
    dsv2.close()
    # low level checks
    cs = dsv2.db.cursor()
    # check the ID column content matches the fixture rows
    # NOTE(review): first column name is interpolated unquoted here, unlike
    # the quoted column list below -- works only for identifier-safe names
    cs.execute('select %s from %s' % (self.anno_header[0], dsv2.name))
    rres = cs.fetchall()
    res = [str(r[0]) for r in rres]
    self.assertSequenceEqual(self.anno_rows, res)
    # check selected columns content matches the fixture column dict
    cols = ','.join([quote(c) for c in self.anno_columns])
    cs.execute('select %s from %s' % (cols, dsv2.name))
    rres = cs.fetchall()
    res = {}
    for ix, ac in enumerate(self.anno_columns):
        res[ac] = [str(r[ix]) for r in rres]
    self.assertDictEqual(self.anno_columns_dict, res)
def _extract_header(self, filehandle, makeMissingID):
    r"""
    Deduce the header from the first two non-comment lines of the DSV file
    and store it in 'self.header'; afterwards the file handle is positioned
    exactly after the header line.

    Parameters
    ----------
    filehandle : file-like
        open handle of the DSV file
    makeMissingID : boolean
        if True and the first header field is empty, substitute the default
        ID column name for it

    Raises
    ------
    Error
        if the header could not be determined (field count of the first line
        differs from the second line)
    """
    cf = self.getCommentSkipper(filehandle)
    # compare first two non-comment lines; a valid header has the same
    # number of fields as the first data line
    l1 = cf.next()
    l2 = cf.next()
    l1_spl = [s.strip() for s in l1.split(self.delimiter)]
    l2_spl = [s.strip() for s in l2.split(self.delimiter)]
    if len(l1_spl) == len(l2_spl):
        if makeMissingID:
            if len(l1_spl[0]) == 0:
                # first header field missing; substitute default ID column
                l1_spl.pop(0)
                l1_spl.insert(0, DSV_DEFAULT_ID_COLUMN)
        self.header = l1_spl
        # rewind exactly after header:
        # first rewind file to the beginning
        filehandle.seek(0)
        # then consume header (comment skipper also skips leading comments)
        cf.next()
    else:
        raise Error(
            'Could not determine header for file %s! (Wrong header line?)' % quote(filehandle.name))
def build(self, anno_dsv, hgnc_dsv, map_db_key):
    r"""
    Construct the mapping using resources already present in KDVS DB (via
    :class:`~kdvs.core.db.DBManager`) and wrapped in
    :class:`~kdvs.fw.DSV.DSV` instances. The mapping is built as a database
    table and wrapped into a :class:`~kdvs.fw.DBTable.DBTable` instance; it
    is stored in the public attribute :attr:`dbt` of this instance. After
    the build is finished, the public attribute :attr:`built` is set to
    True. This builder requires both Affymetrix annotations data and HGNC
    data already loaded in KDVS DB and wrapped in DSV instances. Refer to
    the comments for the resolvancy protocol used.

    Parameters
    ----------
    anno_dsv : :class:`~kdvs.fw.DSV.DSV`
        valid instance of DSV that contains Affymetrix annotations data
    hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
        valid instance of DSV that contains HGNC data
    map_db_key : string
        ID of the database that will hold the mapping table

    Raises
    ------
    Error
        if either DSV is incorrectly specified, is not created, or is empty
    """
    # NOTE: this map follows the resolvancy protocol implemented originally
    # in KDVS v 1.0:
    #   1. get gene symbol(s) from annotations
    #   2. resolve them in HGNC data as follows:
    #      - if the symbol is approved and refers to a gene, retain it
    #      - if the symbol is not approved, discard it
    #   3. for retained symbols, get from HGNC data: Entrez Gene ID,
    #      Ensembl Gene ID, RefSeq IDs
    #   4. for retained symbols, get from annotations: GB accession
    # NOTE: annotations are the source of external IDs; the sequence public
    # ID from annotations is retained; gene symbols are only verified if
    # approved (i.e. the approval history is not followed)
    # ---- check conditions for ANNO
    if not isinstance(anno_dsv, DSV):
        raise Error('%s instance expected! (got %s)' % (DSV.__class__, anno_dsv.__class__))
    if not anno_dsv.isCreated():
        raise Error('Helper data table %s must be created first!' % quote(anno_dsv.name))
    if anno_dsv.isEmpty():
        raise Error('Helper data table %s must not be empty!' % quote(anno_dsv.name))
    # ---- check conditions for HGNC
    if not isinstance(hgnc_dsv, DSV):
        raise Error('%s instance expected! (got %s)' % (DSV.__class__, hgnc_dsv.__class__))
    if not hgnc_dsv.isCreated():
        raise Error('Helper data table %s must be created first!' % quote(hgnc_dsv.name))
    if hgnc_dsv.isEmpty():
        raise Error('Helper data table %s must not be empty!' % quote(hgnc_dsv.name))
    # ---- create em2annotation
    em2annotation_dt = DBTable.fromTemplate(anno_dsv.dbm, map_db_key, EM2ANNOTATION_TMPL)
    em2annotation_dt.create(indexed_columns=EM2ANNOTATION_TMPL['indexes'])
    # pre-query data from HGNC: approved symbol, Entrez Gene ID,
    # Ensembl Gene ID, RefSeq IDs; filter non-gene entries
    # NOTE: we need a cursor due to sheer quantity of data
    hgnc_data = dict()
    only_genes_filter = "%s in (%s)" % (quote(
        self._HGNC_LOCUS_TYPE_COL), ','.join(
            [quote(t) for t in self._HGNC_VALID_GENE_TYPES]))
    query_hgnc_columns = (self._HGNC_APPROVED_SYMBOL_COL,
                          self._HGNC_EGENE_ID_COL,
                          self._HGNC_ENSEMBL_ID_COL,
                          self._HGNC_REFSEQ_ID_COL)
    hgnc_cs = hgnc_dsv.get(columns=query_hgnc_columns, filter_clause=only_genes_filter)
    for hgnc_row in hgnc_cs:
        approved, egeneid, ensembl, refseq = [str(r) for r in hgnc_row]
        hgnc_data[approved] = (egeneid, ensembl, refseq)
    hgnc_cs.close()
    # query ANNO for basic annotations: probeset ID, representative public
    # ID, gene symbol, GB accession
    # NOTE: we need a cursor due to sheer quantity of data
    query_anno_columns = (anno_dsv.id_column, self._ANNO_REPR_PUBLIC_ID_COL,
                          self._ANNO_SEQ_SOURCE_COL,
                          self._ANNO_GENE_SYMBOL_COL, self._ANNO_GB_ACC_COL)
    anno_cs = anno_dsv.get(columns=query_anno_columns)

    def _gen():
        # yield one em2annotation row per annotation row, with only
        # HGNC-approved gene symbols and their HGNC-derived IDs retained
        for arow in anno_cs:
            probeset_id, repr_pub_id, seqsrc, gss_str, gbacc = [
                str(ar) for ar in arow
            ]
            # reconstruct correct public ID
            pubid = '%s:%s' % (self._ANNO_SEQSRC_ABBRS[seqsrc], repr_pub_id)
            # separate gene symbols
            gss = [
                s.strip() for s in gss_str.split(self._ANNO_MULTIFIELD_SEP)
            ]
            gs_rec = list()
            egeneid_rec = list()
            ensembl_rec = list()
            refseq_rec = list()
            for gs in gss:
                if gs in hgnc_data:
                    # gene symbol is approved
                    gs_rec.append(gs)
                    egeneid, ensembl, refseq = hgnc_data[gs]
                    egeneid_rec.append(egeneid)
                    ensembl_rec.append(ensembl)
                    refseq_rec.append(refseq)
            gs_s = MULTIFIELD_SEP.join(gs_rec)
            egeneid_s = MULTIFIELD_SEP.join(
                egeneid_rec) if len(egeneid_rec) > 0 else ''
            ensembl_s = MULTIFIELD_SEP.join(
                ensembl_rec) if len(ensembl_rec) > 0 else ''
            refseq_s = MULTIFIELD_SEP.join(
                refseq_rec) if len(refseq_rec) > 0 else ''
            yield (probeset_id, gs_s, pubid, gbacc, egeneid_s, ensembl_s,
                   refseq_s)
    em2annotation_dt.load(_gen())
    # ---- query em2annotation and build map
    # NOTE: we need a cursor due to sheer quantity of data
    # NOTE(review): gene2emid keeps only the LAST probeset seen for each
    # symbol string; verify this many-to-one collapse is intended
    query_em2a_columns = (em2annotation_dt.id_column, 'gene_symbol')
    em2a_cs = em2annotation_dt.get(columns=query_em2a_columns)
    for em2a_row in em2a_cs:
        pr_id, gs = [str(r) for r in em2a_row]
        self.gene2emid[gs] = pr_id
    em2a_cs.close()
    self.built = True
    self.dbt = em2annotation_dt
class DBTable(object):
    r"""
    Low--level wrapper over a database table managed by the KDVS DB manager.
    KDVS uses database tables to manage query--intensive information, such
    as the robust generation of data subsets from a single main input data
    set. The wrapper encapsulates basic functionality incl. table creation,
    table filling from a specific generator function, querying with
    conditions over columns and rows (in case where the first column holds
    row IDs), generation of an associated :class:`numpy.ndarray` object (if
    possible), as well as basic counting routines.
    """

    def __init__(self, dbm, db_key, columns, name=None, id_col=None):
        r"""
        Parameters
        ----------
        dbm : :class:`~kdvs.core.db.DBManager`
            an instance of DB manager that is managing this table
        db_key : string
            internal ID of the table used by the DB manager instance; it is
            NOT the name of the physical database table in the underlying
            RDBMS; typically, the user of the DB manager refers to the table
            by this ID and not by its physical name
        columns : list/tuple of strings
            column names for the table
        name : string/None
            physical name of the database table in the underlying RDBMS; if
            None, the name is generated semi--randomly; NOTE: an ordinary
            user of the DB manager shall refer to the table with the
            'db_key' ID
        id_col : string/None
            designates a specific column to be the "ID column"; if None, the
            first column is designated as ID column

        Raises
        ------
        Error
            if the DBManager instance is not present, if the list/tuple with
            column names is not present, or if the ID column name is not one
            of the existing columns
        """
        # ---- resolve DBManager
        if not isinstance(dbm, DBManager):
            raise Error('%s instance expected! (got %s)' % (DBManager.__class__, dbm.__class__))
        else:
            self.dbm = dbm
        # ---- get target DB
        db = dbm.getDB(db_key)
        self.db_key = db_key
        self.db = db
        # ---- resolve columns
        if isListOrTuple(columns):
            self.columns = tuple(columns)
        else:
            raise Error('List or tuple expected! (got %s)' % columns.__class__)
        # ---- resolve ID column
        if id_col is None:
            # default: first column is the ID column
            self.id_column_idx = 0
            self.id_column = self.columns[0]
        else:
            if id_col in self.columns:
                self.id_column_idx = self.columns.index(id_col)
                self.id_column = id_col
            else:
                raise Error('ID column must be one of the existing columns! (got %s)' % id_col)
        # ---- resolve table name
        if name is None:
            # semi-random physical name: class name + UUID4 hex
            self.name = '%s%s' % (self.__class__.__name__, uuid.uuid4().hex)
        else:
            self.name = name

    def create(self, indexed_columns='*', debug=False):
        r"""
        Physically create the table in the underlying RDBMS; the creation is
        deferred until this call. The table is created empty.

        Parameters
        ----------
        indexed_columns : list/tuple/'*'
            list/tuple of column names to be indexed by the underlying
            RDBMS; if string '*' is specified, all columns will be indexed;
            '*' by default
        debug : boolean
            provides debug mode for table creation; if True, collect all SQL
            statements produced by the underlying RDBMS and return them as
            list of strings; if False, return None

        Returns
        -------
        statements : list of strings/None
            RDBMS SQL statements issued during table creation, if debug mode
            is requested; or None otherwise

        Raises
        ------
        Error
            if table creation or indexing was interrupted with an error;
            essentially, reraises OperationalError from the underlying RDBMS
        """
        statements = []
        # ---- create table
        cs = self.db.cursor()
        # OperationalError class and column type are provider-specific
        dberror = self.dbm.provider.getOperationalError()
        ctype = self.dbm.provider.getTextColumnType()
        # make columns: all columns share the provider's text column type
        cols = ','.join(['%s %s' % (quote(c), ctype) for c in self.columns])
        # make statement
        st = 'create table %s (%s)' % (quote(self.name), cols)
        if debug:
            # debug mode: record statements instead of executing them
            statements.append(st)
        else:
            try:
                cs.execute(st)
            except dberror, e:
                raise Error('Cannot create table %s in database %s! (Reason: %s)' % (quote(self.name), quote(self.db_key), e))
        # ---- create indexes
        # resolve indexed columns
        if indexed_columns == '*':
            indexed = tuple(self.columns)
        else:
            if isListOrTuple(indexed_columns):
                indexed = tuple(indexed_columns)
            else:
                raise Error('List or tuple expected! (got %s)' % indexed_columns.__class__)
        # make indexes, one per indexed column, named <table>__<column>
        for ic in indexed:
            idx_name = '%s__%s' % (self.name, ic)
            idx_st = 'create index %s on %s(%s)' % (quote(idx_name), quote(self.name), quote(ic))
            if debug:
                statements.append(idx_st)
            else:
                try:
                    cs.execute(idx_st)
                except dberror, e:
                    raise Error('Cannot create index on column %s for table %s in database %s! (Reason: %s)' % (quote(ic), quote(self.name), quote(self.db_key), e))
def get(self, columns='*', rows='*', filter_clause=None, debug=False):
    r"""
    Perform a query from the table under the specified conditions and return
    the corresponding Cursor instance; the Cursor may be used immediately in
    a straightforward manner or may be wrapped in a
    :class:`~kdvs.fw.DBResult.DBResult` instance.

    Parameters
    ----------
    columns : list/tuple/'*'
        list of column names that the querying will be performed from; if
        string '*' is specified instead, all columns will be queried; '*' by
        default
    rows : list/tuple/'*'
        list of rows (i.e. list of values from the designated ID column)
        that the querying will be performed for; if string '*' is specified
        instead, all rows will be queried; '*' by default
    filter_clause : string/None
        additional filtering conditions stated in the form of a correct SQL
        WHERE clause suitable for the underlying RDBMS; if None, no
        additional filtering is added; None by default
    debug : boolean
        provides debug mode for table querying; if True, collect all SQL
        statements produced by the underlying RDBMS and return them as list
        of strings; if False, return None; False by default; NOTE: for this
        method, debug mode DOES NOT perform any physical querying, it just
        produces the underlying SQL statements and returns them

    Returns
    -------
    cs/statements : Cursor/list of strings
        if debug mode was not requested: proper Cursor instance that may be
        used immediately or wrapped into a DBResult object; if debug mode
        was requested: RDBMS SQL statements issued during table querying

    Raises
    ------
    Error
        if the list/tuple of columns/rows was specified incorrectly, if the
        specified list of columns/rows is empty, or if table querying was
        interrupted with an error; essentially, reraises OperationalError
        from the underlying RDBMS

    See Also
    --------
    :pep:`249`
    """
    statements = []
    cs = self.db.cursor()
    dberror = self.dbm.provider.getOperationalError()
    # ---- resolve columns
    if columns == '*':
        # '*' is passed through verbatim to SQL
        cols_st = columns
    else:
        if isListOrTuple(columns):
            if len(columns) == 0:
                raise Error('Non-empty list of columns expected!')
        else:
            raise Error('List or tuple expected! (got %s)' % columns.__class__)
        cols_st = ','.join([quote(c) for c in columns])
    # ---- resolve rows
    if rows != '*':
        if isListOrTuple(rows):
            if len(rows) > 0:
                rs = tuple(rows)
            else:
                raise Error('Non-empty list of rows expected!')
        else:
            raise Error('List or tuple expected! (got %s)' % rows.__class__)
        # row values become the IN (...) list over the ID column
        rows_st = ','.join([quote(r) for r in rs])
    else:
        rows_st = rows
    # ---- make statement
    if rows_st == '*':
        # all rows: optional filter becomes a WHERE clause
        if filter_clause is not None:
            flt_cl = ' where %s' % filter_clause
        else:
            flt_cl = ''
        get_st = 'select %s from %s%s' % (cols_st, quote(self.name), flt_cl)
    else:
        # selected rows: optional filter is ANDed with the IN condition
        if filter_clause is not None:
            flt_cl = ' and %s' % filter_clause
        else:
            flt_cl = ''
        get_st = 'select %s from %s where %s in (%s)%s' % (cols_st, quote(self.name), quote(self.id_column), rows_st, flt_cl)
    # ---- get content
    if debug:
        # debug mode: record the statement instead of executing it
        statements.append(get_st)
    else:
        try:
            cs.execute(get_st)
        except dberror, e:
            raise Error('Cannot select from table %s in database %s! (Reason: %s) (Cols: %s) (Rows: %s)' % (
                quote(self.name), quote(self.db_key), e, columns, rows))
def __init__(self, dbm, db_key, filehandle, dtname=None, delimiter=None, comment=None, header=None, make_missing_ID_column=True): r""" Parameters ---------- dbm : :class:`~kdvs.core.db.DBManager` an instance of DB manager that is managing this table db_key : string internal ID of the table used by DB manager instance; it is NOT the name of physical database table in underlying RDBMS; typically, user of DB manager refers to the table by this ID and not by its physical name filehandle : file--like file handle to associated DSV file that contains the data that DSV table will hold; the file remains open but the data loading is deferred until requested dtname : string/None physical name of database table in underlying RDBMS; if None, the name is generated semi--randomly; NOTE: ordinary user of DB manager shall refer to the table with 'db_key' ID delimiter : string/None delimiter string of length 1 that should be used for parsing of DSV data; if None, the constructor tries to deduce delimiter by looking into first 10 lines of associated DSV file; None by default; NOTE: giving explicit delimiter instead of deducing it dynamically greatly reduces possibility of errors during parsing DSV data comment : string/None comment prefix used in associated DSV file, or None if comments are not used; None by default header : list/tuple of string / None if header is present in the form of list/tuple of strings, it will be used as list of columns for the underlying database table; if None, the constructor tries to deduce the correct header by looking into first two lines of associated DSV file; None by default; NOTE: for well formed DSV files, header should be present, so it is relatively safe to deduce it automatically make_missing_ID_column : boolean used in connection with previous argument; sometimes one can encounter DSV files that contain NO first column name in the header (e.g. 
generated from various R functions), and while they contain correct data, such files are syntactically incorrect; if the constructor sees lack of the first column name, it can proceed according to this parameter; if True, it inserts the content of :data:`DSV_DEFAULT_ID_COLUMN` variable as the missing column name; if False, it inserts empty string "" as the missing column name; True by default Raises ------ Error if proper comment string was not specified Error if underlying DSV dialect of associated DSV file has not been resolved correctly Error if delimiter has not been specified correctly Error if header iterable has not been specified correctly Error if parsing of DSV data during deducing was interrupted with an error; essentially, it reraises underlying csv.Error See Also -------- csv """ # ---- resolve comment if comment is not None and not isinstance(comment, basestring): raise Error('String or None expected! (got %s)' % comment) else: self.comment = comment # ---- resolve delimiter and dialect if delimiter is None: self._resolve_dialect(filehandle) else: if isinstance(delimiter, basestring) and len(delimiter) == 1: self.delimiter = delimiter try: self.dialect = _dialects[delimiter] except KeyError: raise Error( 'Dialect not identified for delimiter %s! (Sniffing required?)' % quote(delimiter)) else: raise Error('Single character expected! (got %s)' % (delimiter)) # ---- resolve header if header is None: self._extract_header(filehandle, make_missing_ID_column) else: if isListOrTuple(header): if len(header) == 0: self._autogenerate_header(filehandle, make_missing_ID_column) else: self._verify_header(filehandle, header) self.header = header else: raise Error('List or tuple expected! (got %s)' % header.__class__) # ---- DSV analysis finished, initialize underlying instance self.handle = filehandle super(DSV, self).__init__(dbm, db_key, self.header, dtname)
def _parse(self, xml_fh):
    r"""
    Parse an OBO-XML release file incrementally and populate the instance
    dictionaries: terms, synonyms, domain2concepts, termsRelationsHierarchy
    and termsPlainHierarchy.

    Parameters
    ----------
    xml_fh : file-like
        open handle of the XML release file

    Raises
    ------
    Error
        if the XML root tag does not match the expected release dialect
    """
    # used pattern for parsing large files, as documented in:
    # http://effbot.org/zone/element-iterparse.htm
    #
    # ---- get context iterator
    context = iter(iterparse(xml_fh, events=('start', 'end')))
    # ---- get root element
    event, root = context.next()
    # ---- check root tag
    root_tag = root.tag
    if root_tag != self.root_tag:
        raise Error(
            'Could not recognize XML dialect of the release! (got root tag: %s, should be: %s)' % (quote(root_tag), quote(self.root_tag)))
    # ---- process other elements
    for event, item in context:
        # ---- process header
        if event == 'end' and item.tag.endswith('header'):
            # ---- get format ID
            self._format = self._findSingleChildText(
                item, 'format-version')
            # ---- get release date
            rel_date = self._findSingleChildText(item, 'date')
            self._release_date = datetime.strptime(rel_date, "%d:%m:%Y %H:%M")
            # clear processed elements to keep memory bounded
            item.clear()
            root.clear()
        # ---- process individual terms
        if event == 'end' and item.tag.endswith('term'):
            # ---- get term acc
            term = self._findSingleChildText(item, 'id')
            self.terms[term] = dict()
            # ---- get term name
            name = self._findSingleChildText(item, 'name')
            self.terms[term]['name'] = name
            # ---- get term namespace
            namespace = self._findSingleChildText(item, 'namespace')
            # NOTE(review): this overwrites the previous term stored for the
            # namespace, keeping only the last one -- verify whether
            # domain2concepts is meant to accumulate (e.g. a multidict)
            self.domain2concepts[namespace] = term
            # ---- get term description
            desc = self._findSingleChildText(item, 'def/defstr')
            self.terms[term]['desc'] = desc
            # ---- get obsolence
            obs_el = item.find('is_obsolete')
            if obs_el is not None and obs_el.text == '1':
                self.terms[term]['obsolete'] = True
                is_obsolete = 1
                obs_el.clear()
            else:
                self.terms[term]['obsolete'] = False
                is_obsolete = 0
            # ---- process further if term not obsolete
            if is_obsolete == 0:
                # ---- get alt ids
                alt_id_els = item.findall('alt_id')
                for alt_id_el in alt_id_els:
                    alt_id = alt_id_el.text
                    # NOTE(review): only the LAST alt_id per term survives
                    # here despite the loop -- confirm intended
                    self.synonyms[term] = alt_id
                    alt_id_el.clear()
                # ---- get relationships
                # is_a
                is_a_els = item.findall('is_a')
                for is_a_el in is_a_els:
                    parent = is_a_el.text
                    if 'is_a' in self.recognizedRelations:
                        # NOTE(review): is_a is stored parent->term, while
                        # other relationships below are stored term->target;
                        # confirm this direction asymmetry is intended
                        self.termsRelationsHierarchy['is_a'][parent] = term
                        self.termsPlainHierarchy[parent] = term
                    is_a_el.clear()
                # other relationships
                rel_els = item.findall('relationship')
                for rel_el in rel_els:
                    rel_type = self._findSingleChildText(rel_el, 'type')
                    rel_dst = self._findSingleChildText(rel_el, 'to')
                    if rel_type in self.recognizedRelations:
                        self.termsRelationsHierarchy[rel_type][
                            term] = rel_dst
                        self.termsPlainHierarchy[term] = rel_dst
                    rel_el.clear()
            # clear processed elements to keep memory bounded
            item.clear()
            root.clear()