def main():
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: %s outdir textfile1 textfile2 ...\n' % sys.argv[0])
        sys.exit(1)
    outdir = sys.argv[1]
    tfdb = DB()
    if not tfdb.open(os.path.join(outdir, 'tf.kch'),
                     DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open tfdb: %s\n' % str(tfdb.error()))
        sys.exit(1)
    dfdb = DB()
    if not dfdb.open(os.path.join(outdir, 'df.kch'),
                     DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open dfdb: %s\n' % str(dfdb.error()))
        sys.exit(1)
    tfidfdb = DB()
    if not tfidfdb.open(os.path.join(outdir, 'tfidf.kch'),
                        DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open tfidfdb: %s\n' % str(tfidfdb.error()))
        sys.exit(1)
    print 'Count words ...'
    for i in range(len(sys.argv) - 2):
        filename = sys.argv[i + 2]
        print '(%d/%d) %s' % (i + 1, len(sys.argv) - 2, filename)
        count_words(tfdb, dfdb, filename)
    print 'Calculate TFIDF ...'
    save_tfidf(tfdb, dfdb, tfidfdb)
    tfdb.close()
    dfdb.close()
    tfidfdb.close()
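# A minimal read-back sketch for the stores built by main() above. It assumes
# count_words/save_tfidf (not shown here) key the databases by raw word strings;
# treat that key format as an assumption, not part of the original code.
import os
from kyotocabinet import DB

def lookup_tfidf(outdir, word):
    db = DB()
    if not db.open(os.path.join(outdir, 'tfidf.kch'), DB.OREADER):
        raise IOError('cannot open tfidfdb: %s' % str(db.error()))
    try:
        return db.get(word)  # None when the word was never indexed
    finally:
        db.close()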
def open_db(self, name):
    db = DB()
    db.open(join(self.path, "%s.kch" % name), DB.OWRITER | DB.OCREATE)
    self.opendb.append(db)
    return db
class KDB:
    def __init__(self, path, truncate=False):
        """
        Open a new connection to a database using the Kyoto Cabinet engine.

        Args:
        * path (str): Path to database.

        Kwargs:
        * truncate (bool, False): If database should be truncated before opening.
        """
        self.db = DB()
        self.batchsize = 1000
        self.batch = {}
        dbparams = '.kct#apow=0#bnum=10000000#msiz=' + str(2 << 30)
        if truncate:
            result = self.db.open(path + dbparams,
                                  DB.OWRITER | DB.OCREATE | DB.OTRUNCATE)
        else:
            result = self.db.open(path + dbparams, DB.OWRITER)
        if not result:
            raise PathError('DNA outdb open error: %s ' % self.db.error())

    def get(self, key):
        """ Retrieve the item with the given `key`. """
        return self.db.get(key)

    def put(self, key, val):
        """
        Put `val` at `key`. Note that disk writing is done in batches, so be
        sure to call `close` or `flush` to make sure that values are put into
        the store.
        """
        self.batch[key] = val
        if len(self.batch) >= self.batchsize:
            self.flush()

    def flush(self):
        """ Write `put` calls to database. """
        self.db.set_bulk(self.batch)
        self.batch = {}

    def close(self):
        """ Flush the database and close the connection to it. """
        self.flush()
        self.db.close()  # close the handle before dropping the reference
        del self.db
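# Usage sketch for the batched KDB wrapper above (the path is illustrative): puts
# are buffered in memory and only hit disk in set_bulk batches of 1000, so close()
# or flush() is required to persist a partial final batch.
kdb = KDB('/tmp/example', truncate=True)
for i in range(2500):
    kdb.put('key-%d' % i, 'value-%d' % i)  # each full batch of 1000 flushes automatically
kdb.close()  # flushes the remaining 500 buffered entries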
def dbOpen(name):
    db = DB()
    if self.create:
        # if not db.open(abspath(self.path) + '/' + name + ".kch",
        #                DB.OWRITER | DB.OCREATE | DB.OAUTOSYNC | DB.OAUTOTRAN):
        if not db.open(abspath(self.path) + "/" + name + ".kch",
                       DB.OWRITER | DB.OCREATE):
            raise IOError("open error: " + str(db.error()))
        return db
    else:
        # if not db.open(abspath(self.path) + '/' + name + ".kch",
        #                DB.OWRITER | DB.OAUTOSYNC | DB.OAUTOTRAN):
        if not db.open(abspath(self.path) + "/" + name + ".kch", DB.OWRITER):
            raise IOError("open error: " + str(db.error()))
        return db
def dbOpen(name):
    db = DB()
    dbpathname = abspath(self.path) + '/' + name + ".kch"
    if self.create:
        # if not db.open(abspath(self.path) + '/' + name + ".kch",
        #                DB.OWRITER | DB.OCREATE | DB.OAUTOSYNC | DB.OAUTOTRAN):
        if not db.open(dbpathname, DB.OWRITER | DB.OCREATE):
            raise IOError("open error: %s %s" % (dbpathname, str(db.error())))  # pragma: NO COVER
        return db
    else:
        # if not db.open(abspath(self.path) + '/' + name + ".kch",
        #                DB.OWRITER | DB.OAUTOSYNC | DB.OAUTOTRAN):
        if not db.open(dbpathname, DB.OWRITER):  # pragma: NO COVER
            raise IOError("open error: %s %s" % (dbpathname, str(db.error())))  # pragma: NO COVER
        return db
def get_items(item_filter, db_file, page=0):
    item_iter = 0
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    cur = db.cursor()
    cur.jump_back()
    while len(items) < FILTER_MAX:
        rec = cur.get(False)
        if not rec:
            break
        # Skip over matching records that belong to earlier pages.
        if item_iter != (FILTER_MAX * page):
            if item_filter(rec):
                item_iter = item_iter + 1
            cur.step_back()
            continue
        if item_filter(rec):
            items.append(rec)
        cur.step_back()
    cur.disable()
    db.close()
    sorted_items = sorted(items, key=get_key, reverse=True)
    sorted_items_for_viewing = [loads(item[1]) for item in sorted_items]
    for item in sorted_items_for_viewing:
        if item['title'] is None or item['title'] == "":
            item['title'] = item['url']
    return sorted_items_for_viewing
class DbKyoto(object):
    def __init__(self, db_file):
        #from train import TAG2ID, WORD2ID#, BAYES_RANK
        #self.ider = WORD2ID
        self.db = DB()
        self.db_file = db_file
        print path.join(KYOTO_DB_PATH, self.db_file)
        if not self.db.open(path.join(KYOTO_DB_PATH, self.db_file),
                            DB.OWRITER | DB.OCREATE):
            print >>sys.stderr, "open error: " + str(self.db.error())

    def set(self, entry):
        key = entry[0]
        result_array = convert2array(entry[1]).tostring()
        if not self.db.set(key, result_array):
            print key
            print result_array
            print >>sys.stderr, "set error: " + str(self.db.error())

    def get(self, key):
        value = self.db.get(key)
        if value:
            result = array('L')
            result.fromstring(value)
            return convert2dict(result)
        else:
            #print >>sys.stderr, self.ider.get_word_by_id(key)
            #print key
            #print >>sys.stderr, "%s error: "%key + str(self.db.error())
            pass
def aggregate_by_hour(db_file):
    # Initialize the dict with each hour
    hours = {key: 0 for key in range(0, 24)}
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded = loads(rec[1])
        unix = float(loaded['created_at'])
        time = datetime.fromtimestamp(unix)
        hours[time.hour] = hours[time.hour] + 1
        cur.step_back()
    cur.disable()
    db.close()
    hours = [{'name': "{}:00".format(key), 'data': [hours[key]]} for key in hours]
    return hours
def get_post_num(post_num, db_file):
    item = None
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    cur = db.cursor()
    cur.jump()
    i = 0
    while True:
        rec = cur.get(False)
        if not rec:
            break
        if i == post_num:
            item = rec
        cur.step()
        i = i + 1
    cur.disable()
    db.close()
    if item is not None:
        return loads(item[1])
    return dict()
class DbKyoto(object):
    def __init__(self, db_file):
        self.db = DB()
        self.db_file = db_file
        if not self.db.open(db_file, DB.OWRITER | DB.OCREATE):
            print >>sys.stderr, 'open error: ' + str(self.db.error())

    def set(self, txt, po_id):
        feature_list = feature_md5(txt)
        for feature in feature_list:
            key = feature
            entry = self.get(key)
            if not entry:
                val = array('L', [po_id])
                if not self.db.set(key, val.tostring()):
                    print >>sys.stderr, 'set error: ' + str(self.db.error())
            else:
                val = array('L')
                val.fromstring(entry)
                if po_id not in val:
                    val.append(po_id)
                    self.db.set(key, val.tostring())
        return val

    def get(self, key):
        po_id = self.db.get(key)
        result = array('L')
        if po_id:
            result.fromstring(po_id)
        return result
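# Round-trip sketch for the inverted index above, assuming feature_md5() (not shown
# here) maps a text to a list of hashable feature keys; the file name is illustrative.
index = DbKyoto('dedup_index.kch')
index.set(u'some post text', 42)             # registers post id 42 under every feature
first_key = feature_md5(u'some post text')[0]
print index.get(first_key)                   # -> array('L', [42L])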
def __load_blast_data(blast):
    # Connect to kyoto db
    db = DB()
    if not db.open("/opt/gene2accession/gene2accession.kch", DB.OREADER):
        raise Exception("Could not load gene2accession.kch: " + str(db.error()))
    hits = {}
    gi_num = re.compile(r'gi\|([0-9]+)')
    for line in blast:
        split_line = line.split('\t')
        # Important data
        evalue = float(split_line[10])
        gi_nums = gi_num.findall(split_line[12])
        # Look each GI number up once instead of calling db.get twice per entry.
        genome_ids = [org for org in (db.get(x) for x in gi_nums) if org is not None]
        # Thanks to Peter's parser, the gi list and org list are the same
        # length (the first gi column is also the first gi in the "master" gi
        # column)
        for org in genome_ids:
            if org in hits:
                hits[org].append(evalue)
            else:
                hits[org] = [evalue]
    db.close()
    return hits
def top_things(db_file):
    urls = {}
    people = {}
    graph = {}
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database. (Top things)"
    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded_rec = loads(rec[1])
        split = get_domain(loaded_rec)
        if urls.get(split, False) == False:
            urls[split] = 1
        else:
            urls[split] = urls[split] + 1
        person = loaded_rec['person']
        if people.get(person, False) == False:
            people[person] = 1
        else:
            people[person] = people[person] + 1
        # Compare with != rather than `is not`: identity tests against ""
        # only pass by accident of string interning.
        if split is not None and split != "" and \
           person is not None and person != "":
            # Build a crazy relational graph out of my nosql data
            if graph.get(split, False) == False:
                graph[split] = {
                    "is_person": False,
                    "data": [person],
                    "linked_to_count": 1
                }
            elif person not in graph[split]:
                graph[split]["data"].append(person)
                graph[split]["linked_to_count"] = graph[split]["linked_to_count"] + 1
            if graph.get(person, False) == False:
                graph[person] = {"is_person": True, "data": [split]}
            elif split not in graph[person]:
                graph[person]["data"].append(split)
        cur.step_back()
    cur.disable()
    db.close()

    def get_one(x):
        return x[1]

    return (sorted(urls.items(), key=get_one, reverse=True),
            sorted(people.items(), key=get_one, reverse=True),
            graph)
def decorated_function(*args, **kwargs):
    # Debug
    if not current_app.config['CACHE']:
        return f(*args, **kwargs)
    db = DB()
    db.open("/tmp/page_cache.kch")
    res = None
    fancy = hash("{}{}{}".format(db_meta_info()['count'], request.url, f.func_name))
    res = db.get(fancy)
    if not res:
        res = f(*args, **kwargs)
        db.set(fancy, res)
    db.close()
    return res
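# Usage sketch: because the cache key hashes db_meta_info()['count'] together with
# the request URL and the view name, any write that changes the record count
# implicitly invalidates every cached page. The decorator and view names below are
# illustrative; only decorated_function appears in the original snippet.
@app.route('/')
@cached          # the (assumed) decorator whose inner function is decorated_function
def front_page():
    return render_front_page()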
def open(self, filename):
    db = DB()
    # self.db = DB(DB.GEXCEPTIONAL)
    if not db.open(filename, DB.OWRITER | DB.OCREATE | DB.ONOLOCK):
        # report the error from the local handle, not self.db
        raise IOError("open error: '" + str(db.error()) + "' on file:" + filename)
    return db
def __init__(self, dbpath, readonly=False):
    self.readonly = readonly
    mode = KDB.OREADER if readonly else KDB.OREADER | KDB.OWRITER | KDB.OCREATE
    db = KDB()
    if not db.open(dbpath, mode):
        raise IOError("kyotocabinet.DB().open(%s, %o): %s"
                      % (dbpath, mode, db.error()))
    self.db = db
    self.dbpath = dbpath
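# Mode sketch for the wrapper above, assuming the enclosing class is named Store
# (the class statement is not part of the snippet): a reader never creates the file.
rw = Store('/tmp/example.kch')                 # OREADER|OWRITER|OCREATE, created on demand
ro = Store('/tmp/example.kch', readonly=True)  # OREADER only; open fails if the file is missing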
def insert_item(url, person, db_file, submitted_title=''):
    mimetype = "application/json"
    db = DB()
    if not db.open("{0}".format(db_file), DB.OWRITER | DB.OCREATE):
        response = {}
        response['What happened?'] = \
            "Couldn't open the damn database. Error: {0}".format(db.error())
        return Response(dumps(response), mimetype=mimetype)
    if is_url_in_db(db, url):
        return Response('{"What happened?": "Someone '
                        'tried to submit a duplicate URL."}',
                        mimetype=mimetype)
    title = url
    summary = "~?~"
    try:
        thing = urlopen(url, timeout=10)
        soup = BeautifulSoup(thing)
        title = soup.title.string

        # Do some dumb summarizing if we can
        def concat(a, v):
            return a + " " + v.strip()

        visible_stuff = filter(visible, soup.findAll(text=True))
        summary = reduce(concat, visible_stuff, "")[:900] + "..."
    except:
        pass
        #return Response('{"What happened?": '
        #                'I dunno bs4 messed up somehow."}',
        #                mimetype=mimetype)
    created_at = int(mktime(datetime.now().utctimetuple()))
    is_image = url.lower().endswith(("jpg", "jpeg", "gif", "png"))
    thumbnail = gen_thumbnail_for_url(url, str(created_at))
    record = {
        "created_at": created_at,
        "title": title,
        "url": url,
        "person": person,
        "summary": summary,
        "person_color": PERSON_COLORS[random.randint(0, len(PERSON_COLORS) - 1)],
        "is_image": is_image,
        "thumbnail": thumbnail,
        "comment": submitted_title
    }
    db.set(created_at, dumps(record))
    db.close()
    return Response('{"What happened?": "MUDADA"}', mimetype=mimetype)
def insert_item(url, person, db_file, submitted_title=''):
    mimetype = "application/json"
    db = DB()
    if not db.open("{0}".format(db_file), DB.OWRITER | DB.OCREATE):
        response = {}
        response['What happened?'] = \
            "Couldn't open the damn database. Error: {0}".format(db.error())
        return Response(dumps(response), mimetype=mimetype)
    if is_url_in_db(db, url):
        return Response('{"What happened?": "Someone '
                        'tried to submit a duplicate URL."}',
                        mimetype=mimetype)
    title = url
    summary = "~?~"
    try:
        thing = urlopen(url, timeout=10)
        soup = BeautifulSoup(thing)
        title = ''
        if len(submitted_title) > 0:
            title = submitted_title
        else:
            title = soup.title.string
        # Do some dumb summarizing if we can
        func = lambda a, v: a + " " + v.strip()
        visible_stuff = filter(visible, soup.findAll(text=True))
        summary = reduce(func, visible_stuff, "")[:900] + "..."
    except:
        pass
        #return Response('{"What happened?": '
        #                'I dunno bs4 messed up somehow."}',
        #                mimetype=mimetype)
    created_at = int(mktime(datetime.now().utctimetuple()))
    is_image = url.lower().endswith(("jpg", "jpeg", "gif", "png"))
    thumbnail = gen_thumbnail_for_url(url, str(created_at))
    record = {
        "created_at": created_at,
        "title": title,
        "url": url,
        "person": person,
        "summary": summary,
        "person_color": PERSON_COLORS[random.randint(0, len(PERSON_COLORS) - 1)],
        "is_image": is_image,
        "thumbnail": thumbnail
    }
    db.set(created_at, dumps(record))
    db.close()
    return Response('{"What happened?": "MUDADA"}', mimetype=mimetype)
def db_meta_info():
    meta = {}
    db = DB()
    db_file = current_app.config['DB_FILE']
    if not db.open("{0}".format(db_file), DB.OREADER):
        print "Could not open database (meta info)."
    meta["size"] = db.size()
    meta["count"] = db.count()
    db.close()
    return meta
def get_post_by_date(key, db_file):
    item = None
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    item = db.get(key)
    db.close()
    if item is not None:
        return loads(item)
    return dict()
def main():
    db = DB()
    if not db.open("bayes.kch", DB.OWRITER | DB.OCREATE):
        return
    with open("word_tf.txt") as word_tf:
        for line in word_tf:
            line = line.strip()
            word, bayes_list = loads(line)
            print word
            if bayes_list:
                ar = array('I')
                ar.fromlist(lineiter(bayes_list))
                db[word] = ar.tostring()
def purge(domain, genid):
    if request.remote_addr not in settings.ALLOW:
        return text_response("Not permitted.\n", 403)
    db = DB()
    if not db.open(settings.GENID_DATABASE, DB.OWRITER | DB.OCREATE):
        return text_response("Failed to purge: cannot open database.\n", 501)
    set_ok = db.set(domain, genid)
    db.close()
    if not set_ok:
        return text_response("Failed to purge: cannot set genid.\n", 501)
    else:
        return text_response("Purged <%s>\n" % (domain,))
def get_items_last_X_days(db_file, X, munge=True):
    dates = {}
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    X_days_ago = datetime.now() - timedelta(days=X)
    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded = loads(rec[1])
        unix = float(loaded['created_at'])
        time = datetime.fromtimestamp(unix)
        if time > X_days_ago:
            if munge:
                date_obj = date(year=time.year, month=time.month, day=time.day)
            else:
                date_obj = time
            # Javascript expects Date.UTC to spit out dates of a certain
            # length.
            day_unix = int(mktime(date_obj.timetuple())) * 1000
            if dates.get(day_unix, False) == False:
                dates[day_unix] = {loaded["person"]: 1}
            else:
                relevant_dict = dates[day_unix]
                if relevant_dict.get(loaded["person"], False) == False:
                    relevant_dict[loaded["person"]] = 1
                else:
                    relevant_dict[loaded["person"]] = relevant_dict[loaded["person"]] + 1
        else:
            break
        cur.step_back()
    cur.disable()
    db.close()
    return dates
def get_last_items(db_file, pages=1):
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    cur = db.cursor()
    cur.jump_back()
    while len(items) < (pages * FILTER_MAX):
        rec = cur.get(False)
        if not rec:
            break
        items.append(rec)
        cur.step_back()
    cur.disable()
    db.close()
    return items
def get_all_items(db_file):
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        items.append(rec)
        cur.step()
    cur.disable()
    db.close()
    sorted_items_for_viewing = [loads(item[1]) for item in items]
    return sorted_items_for_viewing
def get_page_count(item_filter=lambda x: True):
    count = 0
    db = DB()
    db_file = current_app.config['DB_FILE']
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OWRITER | DB.OCREATE):
        print "Could not open database (get_page_count). Error: {}".format(db.error())
    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        if item_filter(rec):
            count = count + 1
        cur.step_back()
    cur.disable()
    db.close()
    return count / FILTER_MAX
def gen_thumbnails(db_file):
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OWRITER):
        sys.exit(1)
    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded = loads(rec[1])
        is_image = loaded["url"].lower().endswith(("jpg", "jpeg", "gif", "png"))
        if is_image:
            print "Thumbnailing {}".format(loaded["url"])
            loaded["is_image"] = True
            try:
                thumbnail = gen_thumbnail_for_url(loaded["url"], rec[0])
            except IOError as e:
                print "IOError: {}".format(e)
                print "Save result: {}".format(cur.set_value(dumps(loaded)))
                cur.step_back()
                continue
            if thumbnail:
                loaded["thumbnail"] = thumbnail
                print "Thumbnailed {}".format(loaded["url"])
            print "Save result: {}".format(cur.set_value(dumps(loaded)))
        cur.step_back()
    cur.disable()
    db.close()
    return True
def get_items_on_page(page, db_file):
    item_iter = 0
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    cur = db.cursor()
    cur.jump_back()
    while len(items) < FILTER_MAX:
        rec = cur.get(False)
        if not rec:
            break
        if item_iter >= (FILTER_MAX * page):
            items.append(rec)
        item_iter = item_iter + 1
        cur.step_back()
    cur.disable()
    db.close()
    return items
def main():
    # Guard before indexing argv; the original `not db_file and not username`
    # check only fired when both arguments were missing.
    if len(argv) < 3:
        print "Need db_file and username."
        return -1
    db_file = argv[1]
    username = argv[2]
    db = DB()
    if not db.open("{0}".format(db_file), DB.OWRITER):
        print "Could not open database."
        return -1
    all_keys = []
    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded = loads(rec[1])
        if loaded["person"] == username:
            all_keys.append(cur.get_key())
        cur.step()
    cur.disable()
    print "Found {} records.".format(len(all_keys))
    for key in all_keys:
        print "Pending {}...".format(key)
        if len(argv) > 3 and argv[3] == '--delete':
            print "Removing {}...".format(key)
            if not db.remove(key):
                print "Could not remove key: {}".format(db.error())
    db.close()
class DataStorage(object):
    """
    Parent class for RowData and KeyValueData.
    """
    def __init__(self, filename, headers=None):
        self.filename = filename
        self.ext = os.path.splitext(filename)[1]
        self.headers = headers
        if os.path.exists(self.filename):
            self.init_read()
        else:
            self.init_write()

    def init_write(self):
        self.mode = "write"
        if self.ext == ".csv":
            self._data_file = open(self.filename, "wb")
            self._writer = csv.writer(self._data_file)
            if self.headers:
                self._writer.writerow(self.headers)
        elif self.ext == ".json":
            self._storage = {}
        elif self.ext == ".kch":
            from kyotocabinet import DB
            self._storage = DB()
            if not self._storage.open(self.filename, DB.OWRITER | DB.OCREATE):
                msg = "Error opening kyotocabinet db: %s" % (self._storage.error())
                raise dexy.exceptions.UserFeedback(msg)
        elif self.ext == ".sqlite3":
            self.init_write_sqlite3()
        else:
            raise dexy.exceptions.UserFeedback("unsupported extension %s" % self.ext)

    def init_read(self):
        self.mode = "read"
        if self.ext == ".csv":
            self._file = open(self.filename, "rb")
        elif self.ext == ".json":
            with open(self.filename, "rb") as f:
                self._storage = json.load(f)
        elif self.ext == ".kch":
            from kyotocabinet import DB
            self._storage = DB()
            self._storage.open(self.filename, DB.OREADER)
        elif self.ext == ".sqlite3":
            import sqlite3
            self._storage = sqlite3.connect(self.filename)
            self._cursor = self._storage.cursor()
        else:
            raise dexy.exceptions.UserFeedback("unsupported extension %s" % self.ext)

    def save(self):
        if self.ext == ".csv":
            self._data_file.close()
        elif self.ext == ".json":
            with open(self.filename, "wb") as f:
                import json
                json.dump(self._storage, f)
        elif self.ext == ".kch":
            if not self._storage.close():
                raise dexy.exceptions.UserFeedback(self._storage.error())
        elif self.ext == ".sqlite3":
            self._storage.commit()
            self._cursor.close()
        else:
            raise dexy.exceptions.UserFeedback("unsupported extension %s" % self.ext)
from kyotocabinet import DB
import sys, time


# format datetime
def format_datetime(timestamp):
    return time.strftime('%Y.%m.%d @ %H:%M', time.localtime(timestamp))

# create the database object
db = DB()

# open the database
if not db.open("../data/db.kch", DB.OWRITER | DB.OCREATE):
    print >>sys.stderr, "open error: " + str(db.error())

cid = '1'
# build for marks form
if not db.set("mark:" + cid + ":markid", "1") or \
   not db.set("mark:" + cid + ":userid", "1") or \
   not db.set("mark:" + cid + ":boardid", "1") or \
   not db.set("mark:" + cid + ":fileid", "1") or \
   not db.set("mark:" + cid + ":description", "mark的内容描述") or \
   not db.set("mark:" + cid + ":content", "哇。。阳光暖暖的,好惬意的。") or \
   not db.set("mark:" + cid + ":ups", "100") or \
   not db.set("mark:" + cid + ":downs", "10") or \
   not db.set("mark:" + cid + ":hits", "110") or \
   not db.set("mark:" + cid + ":order", "1") or \
   not db.set("mark:" + cid + ":createdata", int(time.time())) or \
   not db.set("mark:" + cid + ":commentcount", "8"):
    # the original snippet is truncated mid-chain; reporting the error is an assumed ending
    print >>sys.stderr, "set error: " + str(db.error())
def get_user_stats(username, db_file):
    item = {
        "username": username,
        "aliases": [],
        "total_posts": 0,
        "domains": {},
        "first_post_date": None,
        "first_post_date_unix": None,
        "most_recent_post": None,
        "most_recent_post_unix": 0,
        "average_posts_per_hour": 0.0,
        "average_posts_per_day": 0.0,
        "average_posts_per_week": 0.0
    }
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded_rec = loads(rec[1])
        if loaded_rec['person'] != username:
            cur.step()
            continue
        # Looks like this is a post by the user we're looking for
        split = get_domain(loaded_rec)
        if item['domains'].get(split, False) == False:
            item['domains'][split] = 1
        else:
            item['domains'][split] = item['domains'][split] + 1
        if item['first_post_date_unix'] is None:
            item['first_post_date_unix'] = loaded_rec['created_at']
        if item['most_recent_post_unix'] < loaded_rec['created_at']:
            item['most_recent_post_unix'] = loaded_rec['created_at']
        item['total_posts'] = item['total_posts'] + 1
        cur.step()
    cur.disable()
    db.close()
    # Clean up everything
    first_time = None
    if item['first_post_date_unix'] is not None:
        unix = float(item['first_post_date_unix'])
        first_time = datetime.fromtimestamp(unix)
        item['first_post_date'] = first_time.isoformat()
    recent_time = None
    if item['most_recent_post_unix'] is not None:
        unix = float(item['most_recent_post_unix'])
        recent_time = datetime.fromtimestamp(unix)
        item['most_recent_post'] = recent_time.isoformat()
    if first_time and recent_time:
        delta = recent_time - first_time
        item['user_age_days'] = delta.days
        item['user_age_seconds'] = delta.total_seconds()
        # total_seconds() is in seconds: 3600 per hour, 86400 per day, 604800 per week
        item['average_posts_per_hour'] = item['total_posts'] / (delta.total_seconds() / 3600.0)
        item['average_posts_per_day'] = item['total_posts'] / (delta.total_seconds() / 86400.0)
        item['average_posts_per_week'] = item['total_posts'] / (delta.total_seconds() / 604800.0)
    return item
def convert2array(dict_value):
    '''
    Pack (key, ratio) pairs into a flat array of unsigned longs, scaling
    each ratio by MAX_INT.

    >>> convert2array({1: 0.1, 2: 0.3}.items())
    array('L', [1L, 429496729L, 2L, 1288490188L])
    '''
    result_list = []
    for k, v in dict_value:  # expects an iterable of (key, value) pairs
        result_list.extend([k, int(v * MAX_INT)])
    result = array('L', result_list)
    return result


def convert2dict(array_l):
    '''
    Unpack the flat array back into (key, scaled value) pairs.

    >>> convert2dict([1L, 429496729L, 2L, 1288490188L])
    [(1L, 429496729L), (2L, 1288490188L)]
    '''
    return [(array_l[i], array_l[i + 1]) for i in range(len(array_l)) if i % 2 == 0]


if __name__ == '__main__':
    #import doctest
    #doctest.testmod()
    from kyotocabinet import DB
    db = DB()
    db.open('/mnt/zdata/kyoto/kyoto.kch', DB.OREADER)
    print db.get(15439)
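# Round-trip sketch for the fixed-point packing above, using the values from the
# doctests (which imply MAX_INT is the unsigned 32-bit maximum): ratios are scaled
# to integers on the way in, and only the scaled integers come back out.
pairs = convert2array([(1, 0.1), (2, 0.3)])
assert list(pairs) == [1, 429496729, 2, 1288490188]
assert convert2dict(pairs) == [(1, 429496729), (2, 1288490188)]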
class KyotoIndex(BinaryIndex):
    ''' Kyoto Cabinet index.
        Notably this uses a B+ tree for the index and thus one can
        traverse from one key forwards and backwards, which supports
        the coming Store synchronisation processes.
    '''

    NAME = 'kyoto'
    SUFFIX = 'kct'

    def __init__(self, nmdbpathbase):
        super().__init__(nmdbpathbase)
        self._kyoto = None

    @classmethod
    def is_supported(cls):
        ''' Test whether this index class is supported by the Python environment.
        '''
        # pylint: disable=import-error,unused-import,import-outside-toplevel
        try:
            import kyotocabinet
        except ImportError:
            return False
        return True

    def startup(self):
        ''' Open the index. '''
        # pylint: disable=import-error,import-outside-toplevel
        from kyotocabinet import DB
        self._kyoto = DB()
        self._kyoto.open(self.path, DB.OWRITER | DB.OCREATE)

    def shutdown(self):
        ''' Close the index. '''
        self._kyoto.close()
        self._kyoto = None

    def flush(self):
        ''' Flush pending updates to the index. '''
        try:
            self._kyoto.synchronize(hard=False)
        except TypeError:
            self._kyoto.synchronize()

    def __len__(self):
        return self._kyoto.count()

    def __contains__(self, key):
        return self._kyoto.check(key) >= 0

    def __getitem__(self, key):
        binary_entry = self._kyoto.get(key)
        if binary_entry is None:
            raise KeyError(key)
        return binary_entry

    def __setitem__(self, key, binary_entry):
        self._kyoto[key] = binary_entry

    def keys(self, *, start_hashcode=None):
        ''' Generator yielding the keys from the index
            in order starting with optional `start_hashcode`.

            Parameters:
            * `start_hashcode`: the starting key; if missing or `None`,
              iteration starts with the first key in the index
        '''
        cursor = self._kyoto.cursor()
        if start_hashcode is not None:
            cursor.jump(start_hashcode)
        else:
            cursor.jump()  # no start key: position at the first record
        yield cursor.get_key()
        while cursor.step():
            yield cursor.get_key()
        cursor.disable()

    sorted_keys = keys
    __iter__ = keys
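# Traversal sketch for KyotoIndex: since the backing store is a kct B+ tree, keys()
# yields hashcodes in sorted order and can resume from an arbitrary start key. The
# constructor argument handling lives in BinaryIndex, which is not shown, so the
# path below is an assumption.
idx = KyotoIndex('/tmp/example-index')
idx.startup()
for key in idx.keys():   # pass start_hashcode=... to resume mid-index
    print(key)
idx.shutdown()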
class FeatureSelector(Frontend):

    def __init__(self, fn, mode):
        Frontend.__init__(self, fn, mode)
        self._kdbfn = None
        self._kdb = None
        self._ldbdn = None
        self._ldb = None
        self._len_c = None
        self._len_b = None
        self._len_x = None
        self._ic = None
        self._icbp = None
        self._needs_initialization = True
        self._core_dims = set()
        self._satellite_dims = set()
        self._removed_dims = set()
        self._remove_c = set()
        self._remove_b = set()
        self._remove_x = set()
        self.bypass_c = False
        self.bypass_b = False
        self.bypass_x = False

    def __enter__(self):
        if self._mode == "r":
            with open(self._fn, "rb") as f:
                state = pickle_load(f)
            self._len_c = state["c"]
            self._len_b = state["b"]
            self._len_x = state["x"]
            self._lenrow = self._len_c + self._len_b + self._len_x
            self._ic = state["ic"]
            self._icbp = state["icbp"]
        if self._mode == "w":
            with NamedTemporaryFile() as tmpfn:
                self._kdbfn = tmpfn.name + '.kch'
            self._kdb = KDB()
            try:
                assert self._kdb.open(self._kdbfn, KDB.OWRITER | KDB.OCREATE)
            except:
                print(str(self._kdb.error()))
                raise
            with TemporaryDirectory() as tmpdirname:
                self._ldbdn = tmpdirname
            self._ldb = LDB(self._ldbdn, create_if_missing=True)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        assert Frontend.__exit__(self, exc_type, exc_value, traceback) == False
        if self._ldb is not None:
            sleep(3.0)
            self._ldb.close()
        if self._ldbdn is not None:
            rmtree(self._ldbdn)
        if self._kdb is not None:
            try:
                assert self._kdb.close()
            except:
                print(str(self._kdb.error()))
                raise
        if self._kdbfn is not None:
            remove(self._kdbfn)

    def train(self, row):
        (y, c, b, x) = row
        if self._len_c is None:
            self._len_c = len(c)
        assert self._len_c == len(c)
        if self._len_b is None:
            self._len_b = len(b)
        assert self._len_b == len(b)
        if self._len_x is None:
            self._len_x = len(x)
        assert self._len_x == len(x)
        row = c + b + x
        if Frontend.train(self, row):
            return True
        keyfmt = '>IIIII'
        for i in range(0, self._lenrow):
            for j in range(0, self._lenrow):
                if (i >= j) and (not (i == self._lenrow - 1)):
                    continue
                key = pack(keyfmt, i, j, y, row[i], row[j])
                try:
                    assert self._kdb.increment(key, 1, 0)
                except:
                    print(str(self._kdb.error()))
                    raise

    def _stats(self, cnt_by_a, cnt_by_b, cnt_by_ab):
        h_a = 0.0
        h_b = 0.0
        h_ab = 0.0
        for (val_a, cnt) in cnt_by_a.items():
            p = float(cnt) / float(self._rowcount)
            if p > 0.0:
                h_a -= p * log(p, 2.0)
        for (val_b, cnt) in cnt_by_b.items():
            p = float(cnt) / float(self._rowcount)
            if p > 0.0:
                h_b -= p * log(p, 2.0)
        for ((val_a, val_b), cnt) in cnt_by_ab.items():
            p = float(cnt) / float(self._rowcount)
            if p > 0.0:
                h_ab -= p * log(p, 2.0)
        if h_a == 0.0:
            return 1.0
        if h_b == 0.0:
            return 1.0
        mi = h_a + h_b - h_ab
        return (mi / min(h_a, h_b), h_a, h_b, h_ab, mi)

    def _get_info_content_by_dimension(self, i):
        keyfmt = '>IIIII'
        valfmt = '>Q'
        j = None
        cnt_by_a = {}
        cnt_by_b = {}
        cnt_by_ab = {}
        total = 0
        with self._ldb.iterator() as it:
            it.seek(pack(keyfmt, i, 0, 0, 0, 0))
            for (key, val) in it:
                key = unpack(keyfmt, key)
                val = unpack(valfmt, val)[0]
                if not (key[0] == i):
                    break
                if j is None:
                    j = key[1]
                if not (key[1] == j):
                    break
                # key[2] is the y-value
                a = key[2]
                # key[3] is the value for the i-th dimension
                b = key[3]
                cnt_by_ab[(a, b)] = cnt_by_ab.get((a, b), 0) + val
                cnt_by_a[a] = cnt_by_a.get(a, 0) + val
                cnt_by_b[b] = cnt_by_b.get(b, 0) + val
                total += val
        try:
            assert total == self._rowcount
        except:
            print(i, j, total, self._rowcount)
            raise
        return self._stats(cnt_by_a, cnt_by_b, cnt_by_ab)

    def _get_info_content_by_pair(self, i, j):
        keyfmt = '>IIIII'
        valfmt = '>Q'
        cnt_by_a = {}
        cnt_by_b = {}
        cnt_by_ab = {}
        total = 0
        with self._ldb.iterator() as it:
            it.seek(pack(keyfmt, i, j, 0, 0, 0))
            for (key, val) in it:
                key = unpack(keyfmt, key)
                val = unpack(valfmt, val)[0]
                if not ((key[0] == i) and (key[1] == j)):
                    break
                # key[2] is the y-value, key[3] the value for the i-th dim
                a = (key[2], key[3])
                # key[2] is the y-value, key[4] the value for the j-th dim
                b = (key[2], key[4])
                assert (a, b) not in cnt_by_ab
                cnt_by_ab[(a, b)] = cnt_by_ab.get((a, b), 0) + val
                cnt_by_a[a] = cnt_by_a.get(a, 0) + val
                cnt_by_b[b] = cnt_by_b.get(b, 0) + val
                total += val
        assert total == self._rowcount
        return self._stats(cnt_by_a, cnt_by_b, cnt_by_ab)

    def _finalize(self):
        assert Frontend._finalize(self) is None
        if False:
            print("unique combinations = ", self._kdb.count())
        keyfmt = '>IIIII'
        valfmt = '>Q'
        c = self._kdb.cursor()
        c.jump()
        gt2 = 0
        gt4 = 0
        gt8 = 0
        gt16 = 0
        gt32 = 0
        while True:
            r = c.get(True)
            if not r:
                break
            self._ldb.put(r[0], r[1])
            key = unpack(keyfmt, r[0])
            val = unpack(valfmt, r[1])[0]
            if val > 2:
                gt2 += 1
            if val > 4:
                gt4 += 1
            if val > 8:
                gt8 += 1
            if val > 16:
                gt16 += 1
            if val > 32:
                gt32 += 1
        if False:
            print(gt2, gt4, gt8, gt16, gt32)
        self._ic = {}
        for i in range(0, self._lenrow):
            self._ic[i] = self._get_info_content_by_dimension(i)
        self._icbp = {}
        for i in range(0, self._lenrow):
            for j in range(0, self._lenrow):
                if i >= j:
                    continue
                self._icbp[(i, j)] = self._get_info_content_by_pair(i, j)
        self._state = {
            "ic": self._ic,
            "icbp": self._icbp,
            "c": self._len_c,
            "b": self._len_b,
            "x": self._len_x,
        }

    def _fmt_dim(self, d_):
        d = None
        if d_ < self._len_c:
            d = "c" + str(d_)
        elif d_ < self._len_c + self._len_b:
            d = "b" + str(d_ - self._len_c)
        elif d_ < self._len_c + self._len_b + self._len_x:
            d = "x" + str(d_ - self._len_c - self._len_b)
        else:
            assert False
        return "{:d}({:s})".format(d_, d)

    def _init(self):
        self._needs_initialization = False
        if False:
            for i in sorted(self._ic):
                (corr, h_a, h_b, h_ab, mi) = self._ic[i]
                print("{:s} {:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"
                      .format(self._fmt_dim(i), corr, h_a, h_b, h_ab, mi))
            for (i, j) in sorted(self._icbp):
                (corr, h_a, h_b, h_ab, mi) = self._icbp[(i, j)]
                print("{:s} {:s} {:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"
                      .format(self._fmt_dim(i), self._fmt_dim(j),
                              corr, h_a, h_b, h_ab, mi))
        entropy = [(h_ab, i)
                   for (i, (corr, h_a, h_b, h_ab, mi)) in self._ic.items()]
        output_correlation = [(corr, i)
                              for (i, (corr, h_a, h_b, h_ab, mi)) in self._ic.items()]
        self._core_dims = set()
        self._core_dims |= {i for (h_ab, i) in sorted(entropy, reverse=True)[:5]}
        self._core_dims |= {i for (corr, i) in sorted(output_correlation, reverse=True)[:3]}
        if True:
            print("core = ", " ".join([self._fmt_dim(d) for d in self._core_dims]))
        self._satellite_dims = set()
        for core_dim in self._core_dims:
            satellite_dim = None
            satellite_dim_c = None
            satellite_dim_stats = None
            for ((i, j), (corr, h_a, h_b, h_ab, mi)) in self._icbp.items():
                if corr <= 0.5:
                    continue
                other_dim = None
                if i == core_dim:
                    other_dim = j
                elif j == core_dim:
                    other_dim = i
                else:
                    continue
                if (satellite_dim_c is None) or (corr > satellite_dim_c):
                    satellite_dim = other_dim
                    satellite_dim_c = corr
                    satellite_dim_stats = (corr, h_a, h_b, h_ab, mi)
            if satellite_dim is not None:
                self._satellite_dims.add(satellite_dim)
                if False:
                    print('->', self._fmt_dim(core_dim), self._fmt_dim(satellite_dim))
                    print("{:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"
                          .format(*(corr, h_a, h_b, h_ab, mi)))
        if True:
            print("satellite = ", " ".join([self._fmt_dim(d) for d in self._satellite_dims]))
        self._removed_dims = set()
        for i in self._ic:
            if i not in self._core_dims and i not in self._satellite_dims:
                self._removed_dims.add(i)
        if True:
            print("removed = ", " ".join([self._fmt_dim(d) for d in self._removed_dims]))
        for d_ in self._removed_dims:
            if d_ < self._len_c:
                self._remove_c.add(d_)
            elif d_ < self._len_c + self._len_b:
                self._remove_b.add(d_ - self._len_c)
            elif d_ < self._len_c + self._len_b + self._len_x:
                self._remove_x.add(d_ - self._len_c - self._len_b)
            else:
                assert False

    def apply_c(self, c):
        if self.bypass_c:
            return c
        if self._needs_initialization:
            self._init()
        c_ = []
        for (i, cval) in enumerate(c):
            if i not in self._remove_c:
                c_.append(cval)
        return c_

    def apply_b(self, b):
        if self.bypass_b:
            return b
        if self._needs_initialization:
            self._init()
        b_ = []
        for (i, bval) in enumerate(b):
            if i not in self._remove_b:
                b_.append(bval)
        return b_

    def apply_x(self, x):
        if self.bypass_x:
            return x
        if self._needs_initialization:
            self._init()
        x_ = []
        for (i, xval) in enumerate(x):
            if i not in self._remove_x:
                x_.append(xval)
        return x_

    def __call__(self, row):
        if self._needs_initialization:
            self._init()
        (y, c, b, x) = row
        y_ = y
        return (y_, self.apply_c(c), self.apply_b(b), self.apply_x(x))
from os.path import join  # needed for join(ZDATA_PATH, ...) below

from kyotocabinet import DB
from collections import defaultdict
from array import array
from zkit.zitertools import chunkiter
from operator import itemgetter
from zdata.tag.name2id import NAME2ID
from zkit.txt_cleanup import sp_txt

ID2NAME = defaultdict(list)
for name, id in NAME2ID.iteritems():
    ID2NAME[id].append(name)

db_tag_bayes = DB()
db_tag_bayes.open(join(ZDATA_PATH, "data/bayes.kch"), DB.OREADER)


def tag_id_rank_list_by_txt(txt):
    txt = txt.lower()
    tag_id_list_rank = defaultdict(int)
    for word, rank in tf_idf_seg_txt(txt):
        #print word
        ars = db_tag_bayes.get(word)
        if ars:
            ar = array('I')
            ar.fromstring(ars)
            #print len(ar)
            #print db_tag_bayes[word]
            #print word, ar
            for tag_id, bayes in chunkiter(ar, 2):
                # the original snippet breaks off here; accumulating the
                # rank-weighted bayes score per tag id is an assumed completion
                tag_id_list_rank[tag_id] += rank * bayes
class KyotoCabinetGraph(BaseGraph):

    def __init__(self, path):
        # create the database object
        self._path = path
        self._db = DB()
        # open the database
        if not self._db.open(path, DB.OREADER | DB.OWRITER | DB.OCREATE):
            raise GrapheekDataKyotoCabinetInitFailureException(str(self._db.error()))
        super(KyotoCabinetGraph, self).__init__()
        self._ensure_prepared()
        self._closed = False

    # Start method overriding :

    def _db_close(self):
        if not self._closed:
            self._db.close()

    def _transaction_begin(self):
        self._db.begin_transaction()
        return True

    def _transaction_commit(self, txn):
        self._db.end_transaction(True)

    def _transaction_rollback(self, txn):
        self._db.end_transaction(False)

    def _has_key(self, key):
        return self._db.check(key) >= 0

    def _get(self, txn, key):
        raw_data = self._db.get(key)
        if raw_data is None:
            return UNDEFINED  # Not returning None, as None is a valid value
        return msgpack.loads(raw_data, encoding='utf8')

    def _bulk_get(self, txn, keys):
        result = {}
        key_raw_datas = self._db.get_bulk(keys)
        for key, raw_data in list(key_raw_datas.items()):
            if PYTHON2:  # pragma : no cover
                k = key
            else:  # pragma : no cover
                k = str(key, encoding='utf8')
            result[k] = msgpack.loads(raw_data, encoding='utf8')
        return result

    def _set(self, txn, key, value):
        res = self._db.set(key, msgpack.dumps(value, encoding='utf8'))
        if not res:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException('KyotoCabinet : error while saving')
        return res

    def _bulk_set(self, txn, updates):
        dic = {}
        for key, value in list(updates.items()):
            dic[key] = msgpack.dumps(value, encoding='utf8')
        res = self._db.set_bulk(dic)
        if res == -1:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException('KyotoCabinet : error while saving')
        return res

    def _remove(self, txn, key):
        # Contrary to LocalMemoryGraph implementation, it is not needed to wrap
        # key removal in try.. except because KyotoCabinet only sends "False"
        # when the key does not exist.
        # Thus _remove is idempotent (cf LocalMemoryGraph _remove method comment)
        self._db.remove(key)

    def _bulk_remove(self, txn, keys):
        res = self._db.remove_bulk(list(keys))
        if res == -1:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException('KyotoCabinet : error while saving')
        return res

    def _remove_prefix(self, txn, prefix):
        keys = self._db.match_prefix(prefix)
        self._db.remove_bulk(keys)

    # overriding list methods
    # looks like a bucket of hacks, and yes indeed it is :)
    # btw, it REALLY improves performance if we compare to the default implementation
    # which, in the case of KyotoCabinet, would involve msgpack deserialization
    # followed by a serialization

    def _init_lst(self, txn, key):
        res = self._db.set(key, '')
        if not res:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException('KyotoCabinet : error while saving')
        return res

    def _get_lst(self, txn, key):
        value = self._db.get(key)
        if value is None:
            return UNDEFINED
        # look at _append_to_lst below to understand why a split is done
        # and why the resulting list is sliced from 1
        if PYTHON2:  # pragma : no cover
            return list(map(int, value.split('|')[1:]))
        return list(map(int, str(value, encoding='utf8').split('|')[1:]))  # pragma : no cover

    def _set_lst(self, txn, key, values):
        newval = '|'.join([str(value) for value in values])
        res = self._db.set(key, '|' + newval)
        if not res:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException('KyotoCabinet : error while saving')
        return res

    def _bulk_get_lst(self, txn, keys):
        dic_values = self._db.get_bulk(keys)
        results = []
        for key in keys:
            if PYTHON2:  # pragma : no cover
                values = dic_values.get(key, UNDEFINED)
            else:  # pragma : no cover
                values = dic_values.get(bytes(key, encoding='utf8'), UNDEFINED)
            if values == UNDEFINED:
                results.append([])
            else:
                if PYTHON2:  # pragma : no cover
                    results.append(list(map(int, values.split('|')[1:])))
                else:  # pragma : no cover
                    results.append(list(map(int, str(values, encoding='utf8').split('|')[1:])))
        return results

    def _append_to_lst(self, txn, key, value):
        self._db.append(key, '|' + str(value))

    def _bulk_append_to_lst(self, txn, key, values):
        newval = '|'.join([str(value) for value in values])
        self._db.append(key, '|' + newval)

    def _remove_from_lst(self, txn, key, value):
        old = self._db.get(key)
        if not PYTHON2:  # pragma : no cover
            old = str(old, encoding='utf8')
        # Caution : we are only removing ONE occurence
        # This is voluntary
        # For instance, if the lst contains neighbour nodes, we need to remove only
        # one occurence because the current entity and a neighbour node can be
        # linked multiple times
        new = old.replace('|%s' % value, '', 1)
        if new == old:
            raise ValueError("list.remove(x): x not in list")
        res = self._db.set(key, new)
        if not res:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException('KyotoCabinet : error while saving')
        return res

    def _bulk_remove_from_lst(self, txn, key, values):
        assert len(values)
        old = self._db.get(key)
        if PYTHON2:  # pragma : no cover
            new = old
        else:  # pragma : no cover
            new = str(old, encoding='utf8')
        for value in values:
            new = new.replace('|%s' % value, '', 1)
        if new == old:  # pragma : no cover
            raise ValueError("list.remove(x): x not in list")
        res = self._db.set(key, new)
        if not res:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException('KyotoCabinet : error while saving')
        return res
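# Encoding sketch for the '|'-delimited adjacency lists above: every element is
# appended as '|<int>', so the stored string always starts with '|' and decoding
# drops the empty slot 0 produced by split.
raw = '' + '|1' + '|2' + '|3'                     # what _append_to_lst builds up
assert list(map(int, raw.split('|')[1:])) == [1, 2, 3]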
class BKNNModel(Model):

    def __init__(self, fn, mode, catfe, binfe, contfe, fdisc, fsel, kval):
        Model.__init__(self, fn, mode, catfe, binfe, contfe, fdisc, fsel)
        self._kval = kval
        self._fn_cdata = self._fn
        self._fn_ddata = self._fn.replace('.kch', '-discrete.kch')
        self._fn_meta = self._fn.replace('.kch', '-meta.pickle')
        self._fn_icov = self._fn.replace('.kch', '-icov.pickle')
        self._cdata = None
        self._ddata = None
        self._len_c = None
        self._len_b = None
        self._len_x = None
        self._rowcount = None
        self._total_pos = None
        self._total_neg = None
        self._icov = None
        self._co = None
        self._sample_y = []
        self._sample_c = []
        self._sample_b = []
        self._sample_x = []
        self._sample_x_ = []
        self._needs_finalization = False
        self._needs_initialization = True
        self._dmarginals = {}
        self._dscores = {}
        self._sparse_points = 0
        self._bias = None

    def __enter__(self):
        self._cdata = DB()
        self._ddata = DB()
        try:
            if self._mode == "r":
                assert self._cdata.open(self._fn_cdata, DB.OREADER)
            elif self._mode == "w":
                if isfile(self._fn_cdata):
                    remove(self._fn_cdata)
                assert self._cdata.open(self._fn_cdata, DB.OWRITER | DB.OCREATE)
            else:
                assert False
        except:
            if self._cdata is not None:
                print(str(self._cdata.error()))
                raise
        try:
            if self._mode == "r":
                assert self._ddata.open(self._fn_ddata, DB.OREADER)
            elif self._mode == "w":
                if isfile(self._fn_ddata):
                    remove(self._fn_ddata)
                assert self._ddata.open(self._fn_ddata, DB.OWRITER | DB.OCREATE)
            else:
                assert False
        except:
            if self._ddata is not None:
                print(str(self._ddata.error()))
                raise
        if self._mode == "r":
            with open(self._fn_meta, 'rb') as f:
                r = pickle_load(f)
            self._len_c = r["c"]
            self._len_b = r["b"]
            self._len_x = r["x"]
            self._co = r["co"]
            with open(self._fn_icov, 'rb') as f:
                self._icov = pickle_load(f)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        ex_w_exc = False
        ex_w_exc = ex_w_exc or (exc_type is not None)
        ex_w_exc = ex_w_exc or (exc_value is not None)
        ex_w_exc = ex_w_exc or (traceback is not None)
        if (not ex_w_exc) and (self._mode == "w"):
            if self._needs_finalization:
                self._finalize()
            with open(self._fn_meta, 'wb') as f:
                r = {"c": self._len_c, "b": self._len_b,
                     "x": self._len_x, "co": self._co}
                pickle_dump(r, f)
            with open(self._fn_icov, 'wb') as f:
                pickle_dump(self._icov, f)
        if self._cdata is not None:
            try:
                assert self._cdata.close()
            except:
                print(str(self._cdata.error()))
                raise
            self._cdata = None
        if self._ddata is not None:
            try:
                assert self._ddata.close()
            except:
                print(str(self._ddata.error()))
                raise
            self._ddata = None
        if ex_w_exc and (self._mode == "w"):
            if isfile(self._fn_cdata):
                remove(self._fn_cdata)
            if isfile(self._fn_ddata):
                remove(self._fn_ddata)
            if isfile(self._fn_meta):
                remove(self._fn_meta)
            if isfile(self._fn_icov):
                remove(self._fn_icov)
        return False

    def train(self, row):
        self._needs_finalization = True
        (y, c, b, x) = row
        c = self._fsel.apply_c(self._catfe(c))
        b = self._fsel.apply_b(self._binfe(b))
        x = self._contfe(x)
        x_ = self._fdisc(x)
        x = self._fsel.apply_x(x)
        x_ = self._fsel.apply_x(x_)
        if False:
            print(y, c, b, x, x_)
        if self._len_c is None:
            self._len_c = len(c)
        assert self._len_c == len(c)
        if self._len_b is None:
            self._len_b = len(b)
        assert self._len_b == len(b)
        if self._len_x is None:
            self._len_x = len(x)
        assert self._len_x == len(x)
        if self._rowcount is None:
            self._rowcount = 0
        self._rowcount += 1
        dkeyfmt = '>' + ('I' * (1 + self._len_c + self._len_b))
        self._ddata.increment(pack(dkeyfmt, y, *(c + b)), 1, 0)
        ckeyfmt = '>' + ('I' * len(x))
        cvalfmt = '>I' + ('f' * len(x))
        self._cdata.append(pack(ckeyfmt, *x_), pack(cvalfmt, y, *x))
        if len(self._sample_x) < 50000:
            assert len(self._sample_x) == len(self._sample_y)
            assert len(self._sample_x) == len(self._sample_c)
            assert len(self._sample_x) == len(self._sample_b)
            assert len(self._sample_x) == len(self._sample_x_)
            self._sample_y.append(y)
            self._sample_c.append(c)
            self._sample_b.append(b)
            self._sample_x.append(x)
            self._sample_x_.append(x_)
        return False

    def _init(self):
        self._needs_initialization = False
        c = self._ddata.cursor()
        c.jump()
        keyfmt = '>' + ('I' * (1 + self._len_c + self._len_b))
        valfmt = '>Q'
        while True:
            r = c.get(True)
            if not r:
                break
            dbkey = unpack(keyfmt, r[0])
            dbval = unpack(valfmt, r[1])[0]
            additional_count = dbval
            y = dbkey[0]
            for (i, value_of_variable_i) in enumerate(dbkey[1:]):
                if i not in self._dmarginals:
                    self._dmarginals[i] = {}
                self._dmarginals[i][(y, value_of_variable_i)] = \
                    self._dmarginals[i].get((y, value_of_variable_i), 0) \
                    + additional_count
        for (i, count_by_val) in self._dmarginals.items():
            total = 0
            total_neg = 0
            total_pos = 0
            for ((y, val), cnt) in count_by_val.items():
                total += cnt
                if y == 0:
                    total_neg += cnt
                elif y == 1:
                    total_pos += cnt
            if self._rowcount is None:
                self._rowcount = total
            assert self._rowcount == total
            if self._total_neg is None:
                self._total_neg = total_neg
            try:
                assert self._total_neg == total_neg
            except:
                print(self._total_neg, total_neg)
                raise
            if self._total_pos is None:
                self._total_pos = total_pos
            try:
                assert self._total_pos == total_pos
            except:
                print(self._total_pos, total_pos)
                raise
            assert (self._total_pos + self._total_neg) == self._rowcount
        for i in self._dmarginals:
            values = set([val for (y, val) in self._dmarginals[i].keys()])
            if i not in self._dscores:
                self._dscores[i] = {}
            for val in values:
                pos_cnt = self._dmarginals[i].get((1, val), 0)
                neg_cnt = self._dmarginals[i].get((0, val), 0)
                p_pos = log(float(pos_cnt) + SMOOTHING, 2.0) \
                    - log(float(self._total_pos) + float(len(values)) * SMOOTHING, 2.0)
                p_neg = log(float(neg_cnt) + SMOOTHING, 2.0) \
                    - log(float(self._total_neg) + float(len(values)) * SMOOTHING, 2.0)
                self._dscores[i][val] = p_pos - p_neg
        p_pos = log(float(self._total_pos), 2.0) - log(float(self._rowcount), 2.0)
        p_neg = log(float(self._total_neg), 2.0) - log(float(self._rowcount), 2.0)
        self._bias = p_pos - p_neg
        if False:
            for i in sorted(self._dscores.keys()):
                score_by_val = self._dscores[i]
                for (val, score) in score_by_val.items():
                    print("{:d} {:10d} {:+2.4f}".format(i, val, score))

    def _apply(self, row):
        if self._needs_initialization:
            self._init()
        (c, b, x, x_) = row
        ckeyfmt = '>' + ('I' * len(x_))
        cvalfmt = '>I' + ('f' * len(x))
        cvalsz = calcsize(cvalfmt)
        rng = []
        for xval in x_:
            rng.append([xv
                        for xv in [xval - 2, xval - 1, xval, xval + 1, xval + 2]
                        if 0 <= xv <= 31])
        x_vec = np.array(x).reshape(1, self._len_x).T
        nearest_positive = []
        all_negative = []
        found_ident = 0
        for xvals in product(*rng):
            try:
                ckey = pack(ckeyfmt, *xvals)
            except:
                print(ckeyfmt, xvals)
                raise
            val = self._cdata.get(ckey)
            while val:
                if len(val) <= cvalsz:
                    assert len(val) == cvalsz
                val_ = val[:cvalsz]
                val = val[cvalsz:]
                pt = unpack(cvalfmt, val_)
                pt_y = pt[0]
                pt_x = pt[1:]
                pt_x_vec = np.array(pt_x).reshape(1, self._len_x).T
                diff = pt_x_vec - x_vec
                dist = np.sqrt(np.dot(np.dot(diff.T, self._icov), diff))
                if dist <= 0.0001:
                    found_ident += 1
                    continue
                if pt_y == 0:
                    all_negative.append(dist)
                    continue
                assert pt_y == 1
                nearest_positive.append(dist)
        nearest_positive.sort()
        nearest_positive = nearest_positive[:self._kval]
        # assert found_ident == 1
        # assert len(nearest_positive) == self._kval
        if len(nearest_positive) < self._kval:
            self._sparse_points += 1
        score = self._bias
        # if len(nearest_positive) > 0:
        if True:
            if len(nearest_positive) == 0:
                threshold = None
            else:
                threshold = nearest_positive[-1]
            neg_cnt = 0
            for dist in all_negative:
                if (threshold is None) or (dist <= threshold):
                    neg_cnt += 1
            p_pos = log(float(len(nearest_positive)) + SMOOTHING, 2.0) \
                - log(float(self._total_pos) + 2.0 * SMOOTHING, 2.0)
            p_neg = log(float(neg_cnt) + SMOOTHING, 2.0) \
                - log(float(self._total_neg) + 2.0 * SMOOTHING, 2.0)
            score += p_pos - p_neg
        for (i, dval) in enumerate(c + b):
            score += self._dscores[i].get(dval, 0.0)
        if self._co is None:
            return score
        else:
            if score >= self._co:
                return 1
            else:
                return 0

    def _finalize(self):
        self._needs_finalization = False
        covsample = np.array(self._sample_x)
        cov = np.cov(covsample.T)
        self._icov = LA.inv(cov)
        sample = zip(self._sample_c, self._sample_b, self._sample_x, self._sample_x_)
        scores = []
        for (c, b, x, x_) in sample:
            scores.append(self._apply([c, b, x, x_]))
        sorted_scores = list(sorted(scores))
        cutoffs = []
        for idx in range(0, 1000):
            ratio = float(idx) / 1000.0
            cutoffs.append(sorted_scores[int(float(len(sorted_scores)) * ratio)])
        if False:
            pprint(cutoffs)
        stats_by_co = []
        for coidx in range(0, len(cutoffs)):
            stats_by_co.append({"tp": 0, "fp": 0, "tn": 0, "fn": 0})
        for (y, score) in zip(self._sample_y, scores):
            for (coidx, co) in enumerate(cutoffs):
                if score >= co:
                    if y == 1:
                        stats_by_co[coidx]["tp"] += 1
                    else:
                        assert y == 0
                        stats_by_co[coidx]["fp"] += 1
                else:
                    if y == 0:
                        stats_by_co[coidx]["tn"] += 1
                    else:
                        assert y == 1
                        stats_by_co[coidx]["fn"] += 1
        max_fscore = None
        max_fscore_coidx = None
        for (coidx, co) in enumerate(cutoffs):
            tp = stats_by_co[coidx]["tp"]
            fp = stats_by_co[coidx]["fp"]
            tn = stats_by_co[coidx]["tn"]
            fn = stats_by_co[coidx]["fn"]
            if (tp + fp) <= 0:
                continue
            if (tp + fn) <= 0:
                continue
            precision = float(tp) / float(tp + fp)
            recall = float(tp) / float(tp + fn)
            if (precision + recall) <= 0.0:
                continue
            fscore = 2.0 * ((precision * recall) / (precision + recall))
            if (max_fscore is None) or (fscore > max_fscore):
                max_fscore = fscore
                max_fscore_coidx = coidx
        assert max_fscore_coidx is not None
        self._co = cutoffs[max_fscore_coidx]
        # assert self._sparse_points == 0
        if True:
            print(self._sparse_points)
            print(self._co)
            print(max_fscore)

    def __call__(self, row):
        (c, b, x) = row
        c = self._fsel.apply_c(self._catfe(c))
        b = self._fsel.apply_b(self._binfe(b))
        x = self._contfe(x)
        x_ = self._fdisc(x)
        x = self._fsel.apply_x(x)
        x_ = self._fsel.apply_x(x_)
        try:
            assert self._len_c == len(c)
            assert self._len_b == len(b)
            assert self._len_x == len(x)
            assert self._len_x == len(x_)
        except:
            print(self._len_c, self._len_b, self._len_x)
            raise
        return self._apply((c, b, x, x_))