def step(self, *args): if not args: raise functions.OperatorError("ontop", "No arguments") if len(args) < 3: raise functions.OperatorError("ontop", "Wrong number of arguments") if not self.size: try: self.size = int(args[0]) self.topn = queue.PriorityQueue(self.size) self.argnum = len(args) - 2 except ValueError: raise functions.OperatorError("ontop", "Wrong type in first argument") inparg = args[1] outarg = args[2:] if not self.topn.full(): self.topn.put_nowait((inparg, outarg)) else: inparg_old, outarg_old = self.topn.get_nowait() self.topn.put_nowait( max((inparg, outarg), (inparg_old, outarg_old))) self.stepsnum += 1
def step(self, *args): if not args: raise functions.OperatorError("condbreak", "No arguments") if len(args) < 4: raise functions.OperatorError("condbreak", "Wrong number of arguments") self.vals.append(list(args))
def initargs(self, args): self.init = False if not args: raise functions.OperatorError("datedifffilter", "No arguments") if len(args) < 2: raise functions.OperatorError("datedifffilter", "Wrong number of arguments") self.maxdiff = args[0]
def step(self, *args): if not args: raise functions.OperatorError("minrow", "No arguments") if len(args) != 2: raise functions.OperatorError("minrow", "Wrong number of arguments") if not self.minv: self.minv = (args[0], args[1]) elif args[0] < self.minv[0]: self.minv = (args[0], args[1])
def step(self, *args): if self.first: if not args: raise functions.OperatorError("maxrow", "No arguments") if len(args) != 2: raise functions.OperatorError("maxrow", "Wrong number of arguments") self.maxv = (args[0], args[1]) self.first = False return self.maxv = max(self.maxv, args)
def jaccard(*args): """ .. function:: jaccard(jpack1,jpack2) Return jaccard similarity value of two jpacks. Example: >>> table1(''' ... user1 movie1 20 ... user1 movie2 30 ... user2 movie1 40 ... user2 movie3 90 ... user2 movie4 90 ... user3 movie1 40 ... user3 movie3 80 ... user4 movie1 70 ... user4 movie2 10 ... ''') NOTE that only column b is jgrouped because *jaccard* operates on packs as sets, not weighted values, So packing also column c would not make any difference. >>> sql(\"""select u1.userid,u2.userid, jaccard(u1.pk, u2.pk) as similarity ... from ... (select a as userid, jgroup(b) as pk from table1 group by a) as u1, ... (select a as userid, jgroup(b) as pk from table1 group by a) as u2 ... where u1.userid<u2.userid\""") userid | userid | similarity -------------------------------- user1 | user2 | 0.25 user1 | user3 | 0.333333333333 user1 | user4 | 1.0 user2 | user3 | 0.666666666667 user2 | user4 | 0.25 user3 | user4 | 0.333333333333 """ if len(args) != 2: raise functions.OperatorError("jaccard", "operator takes exactly two arguments") try: r = jopts.fromj(args[0]) s = jopts.fromj(args[1]) except Exception as e: raise functions.OperatorError("jaccard", " Wrong format arguments: %s" % (e)) rset = set([tuple(x) if type(x) == list else x for x in r]) sset = set([tuple(x) if type(x) == list else x for x in s]) return float(len(rset & sset)) / (len(rset | sset))
def final(self): lenofvals = len(self.vals) if lenofvals <= 0: yield ("bgroupid", "C1") yield [None, None, None] return yield tuple(["bgroupid"] + ["C" + str(i) for i in range(1, len(self.vals[0]) - 1)]) counter = 0 if lenofvals != 1: for el in self.vals: try: el.insert(0, iso8601.parse_date(el[0])) except Exception: raise functions.OperatorError( "datediffnewsesid", "Wrong date format: %s" % (el[0])) self.vals.sort(key=itemgetter(0)) dt = self.vals[0][0] for el in self.vals[1:]: dtnew = el[0] diff = dtnew - dt dt = dtnew if (diff.days * 86400 + diff.seconds) > self.maxdiff: counter += 1 if counter != 0: bid = unistr(el[2]) + str(counter) yield [bid] + el[3:]
def final(self): if self.position: self.vals.sort(key=lambda x: tuple(x[-self.comparesize:])) if self.vals == []: size = 0 else: size = len(self.vals[0]) - self.comparesize - 1 if size <= 0: yield ("bgroupid", "C1") yield [None, None] return yield tuple(["bgroupid"] + ["C" + str(i) for i in range(1, size)]) counter = 0 dt = None dtpos = self.comparesize + 1 for el in self.vals: try: dtnew = iso8601.parse_date(el[-dtpos]) except Exception: raise functions.OperatorError( "datediffbreak", "Wrong date format: %s" % (el[-dtpos])) if dt and timedelta2millisec(dtnew - dt) > self.maxdiff: counter += 1 dt = dtnew bid = unistr(el[0]) + str(counter) yield [bid] + el[1:-dtpos]
def VTiter(self, *parsedArgs, **envars): largs, dictargs = self.full_parse(parsedArgs) if 'query' not in dictargs: raise functions.OperatorError( __name__.rsplit('.')[-1], "No query argument ") query = dictargs['query'] cur = envars['db'].cursor() q = cur.execute(query, parse=False) try: yield list(cur.getdescriptionsafe()) except StopIteration: try: raise finally: try: c.close() except: pass gc.disable() while True: yield q.next() gc.enable()
def VTiter(self, *parsedArgs, **envars): largs, dictargs = self.full_parse(parsedArgs) if 'query' not in dictargs: raise functions.OperatorError(__name__.rsplit('.')[-1], "No query argument ") query = dictargs['query'] cur = envars['db'].cursor() c = cur.execute(query, parse=False) schema = [] try: schema = [x[0] for x in cur.getdescriptionsafe()] except StopIteration: try: raise finally: try: c.close() except: pass yield [('rid',), ('colname',), ('val',)] lr = len(schema) while True: l = c.next() rid = l[0] for i in xrange(1, lr): yield (rid, schema[i], l[i])
def VTiter(self, *parsedArgs, **envars): largs, dictargs = self.full_parse(parsedArgs) self.nonames = True self.names = [] self.types = [] if 'query' not in dictargs: raise functions.OperatorError(__name__.rsplit('.')[-1], "No query argument ") query = dictargs['query'] cur = envars['db'].cursor() c = cur.execute(query) try: yield [('rowid', 'integer')] + list(cur.getdescriptionsafe()) except StopIteration: try: raise finally: try: c.close() except: pass i = 1 for r in c: yield [i] + list(r) i += 1
def ifthenelse(*args): """ .. function:: ifthenelse(condition, x, y) Returns *x* if *condition* is true, else returns *y*. .. templateforparams Parameters: :condition: exception type :x: exception value :y: traceback object :returns: true or false .. note:: The difference with the *if* construct in most programming languages is that *x* and *y* expressions will always be evaluated. Examples: >>> sql("select ifthenelse(1>0,'yes','no') as answer") answer ------ yes """ if len(args) < 2: raise functions.OperatorError("ifthenelse", "operator needs at least two inputs") if args[0]: return args[1] else: if len(args) > 2: return args[2] return None
def VTiter(self, *parsedArgs, **envars): largs, dictargs = self.full_parse(parsedArgs) if 'query' not in dictargs: raise functions.OperatorError( __name__.rsplit('.')[-1], "No query argument ") query = dictargs['query'] samplesize = 1 if len(largs) > 0: samplesize = int(largs[0]) if 'size' in dictargs: samplesize = int(dictargs['size']) try: samplesize = int(samplesize) except ValueError: raise functions.OperatorError( __name__.rsplit('.')[-1], "Sample size should be integer") cur = envars['db'].cursor() c = cur.execute(query, parse=False) try: yield list(cur.getdescriptionsafe()) except StopIteration: try: raise finally: try: c.close() except: pass from itertools import islice samplelist = list(islice(c, samplesize)) index = len(samplelist) from random import randint for i, row in enumerate(c, index): r = randint(0, i) if r < samplesize: samplelist[r] = row for r in samplelist: yield r
def initargs(self, args): self.init = False if not args: raise functions.OperatorError("datediffbreak", "No arguments") if len(args) < 4: raise functions.OperatorError("datediffbreak", "Wrong number of arguments") self.maxdiff = args[-1] for i in range(len(args)): if args[i] == 'order': self.position = i self.maxdiff = args[i - 1] self.comparesize = len(args) - (i + 1) if len(args) < 5: raise functions.OperatorError("datediffbreak", "Wrong number of arguments") break
def execontinue(diter, schema, *args, **kargs): if args or kargs: raise functions.OperatorError( __name__.rsplit('.')[-1], "operator takes no arguments") for el in diter: pass pass
def step(self, *args): if len(args) < 2: raise functions.OperatorError("partialsort", "Wrong number of arguments") if not self.sortnum: self.sortnum = tuple(i for i in range(args[0])) self.topn.append(args[1:]) self.stepsnum += 1 pass
def step(self, *args): if not args: raise functions.OperatorError("imax", "No arguments") if len(args) < 2: raise functions.OperatorError("imax", "Wrong number of arguments") if not self.size: try: self.size = int(args[0]) self.topn = queue.PriorityQueue(self.size) except ValueError: raise functions.OperatorError("imax", "Wrong type in first argument") # if queue is empty put value # else get topn value and compare curval = args[1] if not self.topn.full(): self.topn.put_nowait(curval) else: curless = self.topn.get() self.topn.put_nowait(max(curval, curless)) self.stepsnum += 1
def tsetcombinations(*args): """ .. function:: tsetcombinations(termset, r) -> termset Returns all the termset combinations of length r. It is a multiset operator that returns one column but many rows. .. seealso:: * :ref:`tutmultiset` functions >>> sql("select tsetcombinations('t1 t2 t3 t4',2)") C1 ----- t1 t2 t1 t3 t1 t4 t2 t3 t2 t4 t3 t4 """ if len(args) < 1: raise functions.OperatorError("tsetcombinations", "tsetcombinations operator: no input") tset = args[0] if not isinstance(args[1], int): raise functions.OperatorError("tsetcombinations", "tsetcombinations operator: second argument should be integer") yield ("C1",) for p in itertools.combinations(sorted(tset.split(' ')), args[1]): first = False yield [' '.join(p)] if first: yield ['']
def VTiter(self, *parsedArgs, **envars): largs, dictargs = self.full_parse(parsedArgs) if 'query' not in dictargs: raise functions.OperatorError( __name__.rsplit('.')[-1], "No query argument ") query = dictargs['query'] connection = envars['db'] yield (('query', 'text'), ) cur = connection.cursor() execit = cur.execute(query, parse=False) st = '' for row in execit: strow = filterlinecomment(' '.join(row)) if strow == '': continue if st != '': st += '\n' + strow else: st += strow if apsw.complete(st): yield [st] st = '' if len(st) > 0 and not re.match(r'\s+$', st, re.DOTALL | re.UNICODE): if len(st) > 35: raise functions.OperatorError( __name__.rsplit('.')[-1], "Incomplete statement found : %s ... %s" % (st[:15], st[-15:])) else: raise functions.OperatorError( __name__.rsplit('.')[-1], "Incomplete statement found : %s" % (st))
def VTiter(self, *parsedArgs, **envars): largs, dictargs = self.full_parse(parsedArgs) if 'query' not in dictargs: raise functions.OperatorError(__name__.rsplit('.')[-1], "No query argument") query = dictargs['query'] cur = envars['db'].cursor() c = cur.execute(query, parse=False) schema = [('rid',), ('row_id',)] schema_order = {} try: l = prev_l = c.next() except: yield [("c1",)] return rid = 0 record = [rid, l[0]] while l[0] == prev_l[0]: schema.append((str(l[1]),)) schema_order[l[1]] = len(record) record.append(l[2]) prev_l = l try: l = c.next() except: break yield schema yield record rid += 1 record[0] = rid record[1] = l[0] record[2] = l[2] for i in range(3, len(schema)): l = c.next() record[schema_order[l[1]]] = l[2] yield record lr = len(schema) - 2 while True: for i in range(lr): l = c.next() record[schema_order[l[1]]] = l[2] rid += 1 record[0] = rid record[1] = l[0] yield record
def step(self, *args): if self.first: if not args: raise functions.OperatorError("groupdiff", "No arguments") self.prevcomp = args[0] self.data.append(list(args)) self.first = False self.size = len(args) return if args[0] != self.prevcomp: self.prevcomp = args[0] self.data[-1].append(self.repeat) self.data.append(list(args)) self.repeat = 1 else: self.repeat += 1
def levendist(*args): """ .. function:: levendist(str1, str2) Returns *int* which is the levenshtein distance between str1 and str2 Examples: >>> sql("select levendist('HURQBOHP','QKHOZ') ") levendist('HURQBOHP','QKHOZ') ----------------------------- 7 """ if len(args) != 2: raise functions.OperatorError("levendist", "operator accepts two arguments") return sd.levenshtein_dist(args[0], args[1])
def step(self, *args): if self.notchecked: if len(args) < 2: raise functions.OperatorError("groupmax", "Wrong number of arguments") self.grouplen = args[0] self.numofargs = len(args) self.notchecked = False groupkey = args[1:self.grouplen + 1] try: group = self.groupsdict[groupkey] j = 0 for i in range(self.grouplen + 1, self.numofargs): group[j].append(args[i]) j += 1 except KeyError: self.groupsdict[groupkey] = [[x] for x in args[self.grouplen + 1:]]
def quickstrdist(*args): """ .. function:: damlevendist(str1, str2) Returns *int* which is a string distance between str1 and str2, based on Python's difflib library. It is a lot faster than levendist or damlevendist. Examples: >>> sql("select quickstrdist('HURQBOHP','QKHOZ') ") quickstrdist('HURQBOHP','QKHOZ') -------------------------------- 8 """ if len(args) != 2: raise functions.OperatorError("quickstrdist", "operator accepts two arguments") return sd.quick_string_dist(args[0], args[1])
def VTiter(self, *parsedArgs, **envars): def authorizer(operation, paramone, paramtwo, databasename, triggerorview): """Called when each operation is prepared. We can return SQLITE_OK, SQLITE_DENY or SQLITE_IGNORE""" # find the operation name plan.append([ apsw.mapping_authorizer_function[operation], paramone, paramtwo, databasename, triggerorview ]) return apsw.SQLITE_OK def buststatementcache(): c = connection.cursor() for i in range(110): a = list(c.execute("select " + str(i))) _, dictargs = self.full_parse(parsedArgs) if 'query' not in dictargs: raise functions.OperatorError( __name__.rsplit('.')[-1], " needs query argument ") query = dictargs['query'] connection = envars['db'] plan = [] buststatementcache() cursor = connection.cursor() cursor.setexectrace(lambda x, y, z: apsw.SQLITE_DENY) connection.setauthorizer(authorizer) cursor.execute(query) connection.setauthorizer(None) yield [('operation', 'text'), ('paramone', 'text'), ('paramtwo', 'text'), ('databasename', 'text'), ('triggerorview', 'text')] for r in plan: yield r
def tsetdiff(*args): """ .. function:: termsetdiff(termset1, termset2) -> termset Returns the termset that is the difference of sets of termset1 - termset2. Examples: >>> table1(''' ... 't1 t2 t3' 't2 t3' ... 't3 t2 t1' 't3 t4' ... ''') >>> sql("select tsetdiff(a,b) from table1") tsetdiff(a,b) ------------- t1 t1 t2 """ if len(args) < 2: raise functions.OperatorError("tsetdiff", "tsetdiff operator: at least two termsets should be provided") return ' '.join(sorted(set(args[0].split(' ')) - set(args[1].split(' '))))
def VTiter(self, *parsedArgs, **envars): largs, dictargs = self.full_parse(parsedArgs) if 'query' not in dictargs: raise functions.OperatorError( __name__.rsplit('.')[-1], "No query argument ") query = dictargs['query'] connection = envars['db'] yield (('column', 'text'), ('type', 'text')) cur = connection.cursor() execit = cur.execute(query, parse=False) try: samplerow = execit.next() except StopIteration: pass vals = cur.getdescriptionsafe() cur.close() for i in vals: yield i
def VTiter(self, *parsedArgs, **envars): largs, dictargs = self.full_parse(parsedArgs) if 'query' not in dictargs: raise functions.OperatorError( __name__.rsplit('.')[-1], "No query argument ") query = dictargs['query'] if 'timewindow' not in dictargs: raise functions.OperatorError( __name__.rsplit('.')[-1], "No TimeWindow argument ") else: winlen = int(dictargs['timewindow']) if 'timecolumn' not in dictargs: raise functions.OperatorError( __name__.rsplit('.')[-1], "No timecolumn argument ") else: timecolumn = int(dictargs['timecolumn']) cur = envars['db'].cursor() c = cur.execute(query, parse=False) try: yield [('wid', 'integer')] + list(cur.getdescriptionsafe()) except StopIteration: try: raise finally: try: c.close() except: pass wid = 0 secs = 0 row = c.next() firstTime = int( time.mktime(parser.parse(row[timecolumn], fuzzy=True).timetuple())) head = {firstTime: [row]} window = deque([]) while row: prev = row try: row = c.next() except StopIteration: if wid == 0: for k in head.keys(): for t in head[k]: yield (wid, ) + t for rl in window: for k in rl.keys(): for t in rl[k]: yield (wid, ) + t break secs = int( time.mktime( parser.parse(row[timecolumn], fuzzy=True).timetuple())) if secs <= firstTime + winlen: if prev[0] == row[timecolumn] and window: old = window.pop()[secs] old.append(row) rowlist = {secs: old} else: rowlist = {secs: [row]} window.append(rowlist) else: if wid == 0: for k in head.keys(): for t in head[k]: yield (wid, ) + t for rl in window: for k in rl.keys(): for t in rl[k]: yield (wid, ) + t while secs > firstTime + winlen and window: try: head = window.popleft() firstTime = head.keys()[0] except IndexError: break rowlist = {secs: [row]} window.append(rowlist) wid += 1 for k in head.keys(): for t in head[k]: yield (wid, ) + t for rl in window: for k in rl.keys(): for t in rl[k]: yield (wid, ) + t
def VTiter(self, *parsedArgs, **envars): opts = self.full_parse(parsedArgs) self.query = None self.start = 0 self.end = sys.maxint if 'start' in opts[1]: self.start = int(opts[1]['start']) if 'end' in opts[1]: self.end = int(opts[1]['end']) try: dbname = opts[0][0] except: raise functions.OperatorError( __name__.rsplit('.')[-1], "A DB filename should be provided") try: self.query = opts[1]['query'] except: pass self.dbfile = str( os.path.abspath( os.path.expandvars(os.path.expanduser( os.path.normcase(dbname))))) tablename = os.path.split(self.dbfile)[1] if 'tablename' in opts[1]: tablename = opts[1]['tablename'] if 'table' in opts[1]: tablename = opts[1]['table'] if self.query == None: self.query = 'select * from ' + tablename + ';' self.part = self.start try: self.xcon = apsw.Connection(self.dbfile + '.' + str(self.part) + '.db', flags=apsw.SQLITE_OPEN_READONLY) except Exception as e: print e raise functions.OperatorError( __name__.rsplit('.')[-1], "DB could not be opened") self.xcursor = self.xcon.cursor() self.xexec = self.xcursor.execute(self.query) yield self.findschema() while self.part < self.end: try: self.xcon = apsw.Connection(self.dbfile + '.' + str(self.part) + '.db', flags=apsw.SQLITE_OPEN_READONLY) self.xcursor = self.xcon.cursor() self.xexec = self.xcursor.execute(self.query) except apsw.CantOpenError as e: raise StopIteration gc.disable() for row in self.xexec: yield row gc.enable() self.part += 1
def VTiter(self, *parsedArgs, **envars): def exprown(row): for i in xrange(len(row)): iobj = row[i] if type(iobj) is tuple: for el in iobj[1]: for l in exprown(row[(i + 1):]): yield row[:i] + list(el) + l try: del (self.connection.openiters[iobj[0]]) except KeyboardInterrupt: raise except: pass return yield row largs, dictargs = self.full_parse(parsedArgs) if 'query' not in dictargs: raise functions.OperatorError( __name__.rsplit('.')[-1], "No query argument ") query = dictargs['query'] self.connection = envars['db'] oiters = self.connection.openiters iterheader = functions.iterheader lenIH = len(iterheader) cur = self.connection.cursor() c = cur.execute(query, parse=False) schema = cur.getdescriptionsafe() self.nonames = True types = [] orignames = [x[0] for x in schema] origtypes = [x[1] if len(x) > 1 else 'None' for x in schema] nrow = [] nnames = [] ttypes = [] try: row = c.next() except StopIteration: yield schema return rowlen = len(row) for i in xrange(rowlen): obj = row[i] if type(obj) is buffer and obj[:lenIH] == iterheader: strobj = str(obj) oiter = oiters[strobj] try: first = oiter.next() except StopIteration: first = [None] ttypes += ['GUESS'] * len(first) if noas.match(orignames[i]): badschema = False if type(first) != tuple: badschema = True for i in first: if type(first) != tuple or type(i) not in ( unicode, str) or i is None: badschema = True break if badschema: raise functions.OperatorError( __name__.rsplit('.')[-1], "First yielded row of multirow functions, should contain the schema inside a Python tuple.\nExample:\n yield ('C1', 'C2')" ) nnames += list(first) else: if len(first) == 1: nnames += [orignames[i]] else: nnames += [ orignames[i] + str(j) for j in xrange(1, len(first) + 1) ] nrow += [(strobj, oiter)] else: ttypes += [origtypes[i]] nnames += [orignames[i]] nrow += [obj] firstbatch = exprown(nrow) try: firstrow = firstbatch.next() except StopIteration: firstrow = None for i, v in enumerate(ttypes): if v == 'GUESS': try: v = getElementSqliteType(firstrow[i]) except Exception, e: v = 'text' types.append(v)