def main(): if len(sys.argv) < 3: sys.stderr.write('Usage: %s outdir textfile1 textfile2 ...\n' % sys.argv[0]) sys.exit(1) outdir = sys.argv[1] tfdb = DB() if not tfdb.open(os.path.join(outdir, 'tf.kch'), DB.OWRITER | DB.OCREATE | DB.OTRUNCATE): sys.stderr.write('cannot open tfdb: %s\n' % str(tfdb.error)) sys.exit(1) dfdb = DB() if not dfdb.open(os.path.join(outdir, 'df.kch'), DB.OWRITER | DB.OCREATE | DB.OTRUNCATE): sys.stderr.write('cannot open dfdb: %s\n' % str(dfdb.error)) sys.exit(1) tfidfdb = DB() if not tfidfdb.open(os.path.join(outdir, 'tfidf.kch'), DB.OWRITER | DB.OCREATE | DB.OTRUNCATE): sys.stderr.write('cannot open tfidfdb: %s\n' % str(tfidfdb.error)) sys.exit(1) print 'Count words ...' for i in range(len(sys.argv) - 2): filename = sys.argv[i + 2] print '(%d/%d) %s' % (i + 1, len(sys.argv) - 2, filename) count_words(tfdb, dfdb, filename) print 'Calculate TFIDF ...' save_tfidf(tfdb, dfdb, tfidfdb) tfdb.close() dfdb.close() tfidfdb.close()
def __load_blast_data(blast):
    """Map genome ids to the e-values of their BLAST hits.

    Reads gi numbers out of column 12 of each tab-separated BLAST line,
    resolves them to genome ids via the gene2accession Kyoto Cabinet db,
    and groups the line's e-value (column 10) under each resolved genome.
    Returns {genome_id: [evalue, ...]}.
    """
    # Connect to kyoto db
    db = DB()
    if not db.open("/opt/gene2accession/gene2accession.kch", DB.OREADER):
        raise Exception("Could not load gene2accession.kch: " + str(db.error()))
    hits = {}
    gi_num = re.compile(r'gi\|([0-9]+)')
    for line in blast:
        split_line = line.split('\t')
        # Important data
        evalue = float(split_line[10])
        gi_nums = gi_num.findall(split_line[12])
        # PERF FIX: the original called db.get(x) twice per gi (once for the
        # None test, once for the value); do a single lookup each.
        genome_ids = [org for org in (db.get(x) for x in gi_nums)
                      if org is not None]
        # Thanks to Peter's parser, the gi list and org list are the same
        # length (the first gi column is also the first gi in the "master" gi
        # column)
        for org in genome_ids:
            hits.setdefault(org, []).append(evalue)
    db.close()
    return hits
def get_items(item_filter, db_file, page=0):
    """Return up to FILTER_MAX decoded items for `page`, newest first.

    `item_filter` decides which raw records count toward pagination; records
    it rejects are skipped without advancing the page offset.  Items whose
    title is empty fall back to their URL.
    """
    item_iter = 0
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    cur = db.cursor()
    cur.jump_back()  # start at the newest record
    while len(items) < FILTER_MAX:
        rec = cur.get(False)
        if not rec:
            break
        # Skip phase: count matching records belonging to earlier pages
        # until the page offset (FILTER_MAX * page) is reached.
        if item_iter != (FILTER_MAX * page):
            if item_filter(rec):
                item_iter = item_iter + 1
            cur.step_back()
            continue
        # Collect phase: item_iter stays at the offset, so we fall through
        # here for every remaining record until FILTER_MAX items are found.
        if item_filter(rec):
            items.append(rec)
        cur.step_back()
    cur.disable()
    db.close()
    sorted_items = sorted(items, key=get_key, reverse=True)
    sorted_items_for_viewing = [loads(item[1]) for item in sorted_items]
    for item in sorted_items_for_viewing:
        if item['title'] is None or item['title'] == "":
            item['title'] = item['url']
    return sorted_items_for_viewing
def get_post_num(post_num, db_file): item = None db = DB() if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE): print "Could not open database." cur = db.cursor() cur.jump() i = 0 while True: rec = cur.get(False) if not rec: break if i == post_num: item = rec cur.step() i = i + 1 cur.disable() db.close() if item is not None: return loads(item[1]) return dict()
def aggregate_by_hour(db_file): # Initialize the dict with each hour hours = {key: 0 for key in range(0, 24)} db = DB() if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE): print "Could not open database." cur = db.cursor() cur.jump_back() while True: rec = cur.get(False) if not rec: break loaded = loads(rec[1]) unix = float(loaded['created_at']) time = datetime.fromtimestamp(unix) hours[time.hour] = hours[time.hour] + 1 cur.step_back() cur.disable() db.close() hours = [{ 'name': "{}:00".format(key), 'data': [hours[key]] } for key in hours] return hours
def init_write(self):
    """Prepare this object's storage for writing, keyed on file extension.

    .csv     -> buffered csv.writer; headers row emitted up front if present
    .json    -> plain dict accumulated in memory
    .kch     -> kyotocabinet DB opened writable/creatable
    .sqlite3 -> delegated to init_write_sqlite3()
    Raises dexy UserFeedback for unknown extensions or a failed .kch open.
    """
    self.mode = "write"
    if self.ext == ".csv":
        self._data_file = open(self.filename, "wb")
        self._writer = csv.writer(self._data_file)
        if self.headers:
            self._writer.writerow(self.headers)
    elif self.ext == ".json":
        self._storage = {}
    elif self.ext == ".kch":
        # Imported lazily so kyotocabinet is only required when used.
        from kyotocabinet import DB
        self._storage = DB()
        if not self._storage.open(self.filename, DB.OWRITER | DB.OCREATE):
            msg = "Error opening kyotocabinet db: %s" % (
                self._storage.error())
            raise dexy.exceptions.UserFeedback(msg)
    elif self.ext == ".sqlite3":
        self.init_write_sqlite3()
    else:
        raise dexy.exceptions.UserFeedback("unsupported extension %s" % self.ext)
def startup(self): ''' Open the index. ''' # pylint: disable=import-error,import-outside-toplevel from kyotocabinet import DB self._kyoto = DB() self._kyoto.open(self.path, DB.OWRITER | DB.OCREATE)
def top_things(db_file): urls = {} people = {} graph = {} db = DB() if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE): print "Could not open database. (Top things)" cur = db.cursor() cur.jump_back() while True: rec = cur.get(False) if not rec: break loaded_rec = loads(rec[1]) split = get_domain(loaded_rec) if urls.get(split, False) == False: urls[split] = 1 else: urls[split] = urls[split] + 1 person = loaded_rec['person'] if people.get(person, False) == False: people[person] = 1 else: people[person] = people[person] + 1 if split is not None and split is not "" and \ person is not None and person is not "": # Build a crazy relational graph out of my nosql data if graph.get(split, False) == False: graph[split] = { "is_person": False, "data": [person], "linked_to_count": 1 } elif person not in graph[split]: graph[split]["data"].append(person) graph[split][ "linked_to_count"] = graph[split]["linked_to_count"] + 1 if graph.get(person, False) == False: graph[person] = {"is_person": True, "data": [split]} elif split not in graph[person]: graph[person]["data"].append(split) cur.step_back() cur.disable() db.close() def get_one(x): return x[1] return (sorted(urls.items(), key=get_one, reverse=True), sorted(people.items(), key=get_one, reverse=True), graph)
def open_db(self, name):
    """Open (creating if needed) `<self.path>/<name>.kch`, remember the
    handle in self.opendb so it can be closed later, and return it."""
    handle = DB()
    target = join(self.path, "%s.kch" % name)
    handle.open(target, DB.OWRITER | DB.OCREATE)
    self.opendb.append(handle)
    return handle
def insert_item(url, person, db_file, submitted_title=''):
    """Store a new post for `url` by `person`, scraping title/summary.

    Rejects duplicate URLs.  Scraping is best-effort: on any failure the
    record is stored with the URL as title and a placeholder summary.
    Returns a JSON flask Response describing the outcome.
    """
    mimetype = "application/json"
    db = DB()
    if not db.open("{0}".format(db_file), DB.OWRITER | DB.OCREATE):
        response = {}
        response[
            'What happened?'] = "Couldn't open the damn database. Error: {0}".format(
                db.error())
        return Response(dumps(response), mimetype=mimetype)
    if is_url_in_db(db, url):
        return Response('{"What happened?": "Someone '\
            'tried to submit a duplicate URL."}',
            mimetype=mimetype)
    title = url
    summary = "~?~"
    try:
        # Fetch the page and scrape a title plus a rough visible-text summary.
        thing = urlopen(url, timeout=10)
        soup = BeautifulSoup(thing)
        title = soup.title.string
        # Do some dumb summarizing if we can

        def concat(a, v):
            return a + " " + v.strip()
        visible_stuff = filter(visible, soup.findAll(text=True))
        summary = reduce(concat, visible_stuff, "")[:900] + "..."
    except:
        # NOTE(review): bare except deliberately swallows all scraping
        # errors (best-effort enrichment); the record is still stored below
        # with the fallback title/summary.
        pass
        #return Response('{"What happened?": '\
        #    'I dunno bs4 messed up somehow."}',
        #    mimetype=mimetype)
    created_at = int(mktime(datetime.now().utctimetuple()))
    is_image = url.lower().endswith(("jpg", "jpeg", "gif", "png"))
    thumbnail = gen_thumbnail_for_url(url, str(created_at))
    record = {
        "created_at": created_at,
        "title": title,
        "url": url,
        "person": person,
        "summary": summary,
        "person_color": PERSON_COLORS[random.randint(0, len(PERSON_COLORS) - 1)],
        "is_image": is_image,
        "thumbnail": thumbnail,
        "comment": submitted_title
    }
    # Keyed by creation timestamp (seconds); two posts in the same second
    # would collide -- presumably acceptable at this traffic level.
    db.set(created_at, dumps(record))
    db.close()
    return Response('{"What happened?": "MUDADA"}', mimetype=mimetype)
def __init__(self, db_file):
    """Open (or create) `db_file` under KYOTO_DB_PATH for writing.

    On failure the kyotocabinet error is printed to stderr; the object is
    still constructed with the unopened handle.
    """
    #from train import TAG2ID, WORD2ID#, BAYES_RANK
    #self.ider = WORD2ID
    self.db = DB()
    self.db_file = db_file
    print path.join(KYOTO_DB_PATH, self.db_file)
    if not self.db.open(path.join(KYOTO_DB_PATH, self.db_file),
                        DB.OWRITER | DB.OCREATE):
        print >> sys.stderr, "open error: " + str(self.db.error())
def __init__(self, dir, subdirs=[]):
    """Bind this object to a zlib-compressed .kch database for `dir`.

    One DB handle is opened per directory and cached in the module-level
    dbList, so repeated construction reuses the same handle.  `subdirs`
    are joined into this object's key prefix.
    """
    # NOTE: `subdirs=[]` mutable default is harmless here -- it is only
    # read, never mutated.
    dir += "_zlib"
    if dir not in dbList:
        dbList[dir] = DB()
        # Tuning string enables zlib compression and a debug log file.
        # NOTE(review): "logkinds=debu" looks like a typo for "debug" --
        # confirm against the kyotocabinet tuning-parameter docs.
        dbList[dir].open(
            dir + ".kch#ops=c#log=" + dir + ".log#logkinds=debu#zcomp=zlib",
            DB.OWRITER | DB.OCREATE)
    self.db = dbList[dir]
    self.key = "/".join(subdirs)
def db_meta_info(): meta = {} db = DB() db_file = current_app.config['DB_FILE'] if not db.open("{0}".format(db_file), DB.OREADER): print "Could not open database (meta info)." meta["size"] = db.size() meta["count"] = db.count() db.close() return meta
def __init__(self, path):
    """Open (creating if necessary) the Kyoto Cabinet file backing this graph.

    Raises GrapheekDataKyotoCabinetInitFailureException when the db cannot
    be opened, then runs the base-class setup and schema preparation.
    """
    # create the database object
    self._path = path
    self._db = DB()
    # open the database
    if not self._db.open(path, DB.OREADER | DB.OWRITER | DB.OCREATE):
        raise GrapheekDataKyotoCabinetInitFailureException(
            str(self._db.error()))
    super(KyotoCabinetGraph, self).__init__()
    self._ensure_prepared()
    self._closed = False
def get_post_by_date(key, db_file): item = None db = DB() if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE): print "Could not open database." item = db.get(key) db.close() if item is not None: return loads(item) return dict()
def main():
    """Load word -> bayes-list pairs from word_tf.txt into bayes.kch.

    Each line of word_tf.txt decodes to (word, bayes_list); non-empty lists
    are packed into an unsigned-int array and stored as raw bytes.
    Silently returns if the database cannot be opened.
    """
    db = DB()
    if not db.open("bayes.kch", DB.OWRITER | DB.OCREATE):
        return
    with open("word_tf.txt") as word_tf:
        for line in word_tf:
            line = line.strip()
            word, bayes_list = loads(line)
            print word
            if bayes_list:
                ar = array('I')  # unsigned 32-bit ints
                ar.fromlist(lineiter(bayes_list))
                # Store the packed binary form (Python 2 tostring == tobytes).
                db[word] = ar.tostring()
def dbOpen(name):
    """Open `<self.path>/<name>.kch` and return the handle.

    When self.create is set the file may be created; otherwise it must
    already exist and is opened write-only.  Raises IOError on failure.
    (Closure: `self` is captured from the enclosing scope.)
    """
    db = DB()
    dbpathname = abspath(self.path) + '/' + name + ".kch"
    if self.create:
        # if not db.open(abspath(self.path) + '/' + name + ".kch",
        #     DB.OWRITER | DB.OCREATE | DB.OAUTOSYNC | DB.OAUTOTRAN):
        if not db.open(dbpathname, DB.OWRITER | DB.OCREATE):
            raise IOError("open error: %s %s" % (
                dbpathname, str(db.error())))  # pragma: NO COVER
        return db
    else:
        # if not db.open(abspath(self.path) + '/' + name + ".kch",
        #     DB.OWRITER | DB.OAUTOSYNC | DB.OAUTOTRAN):
        if not db.open(dbpathname, DB.OWRITER):  # pragma: NO COVER
            raise IOError("open error: %s %s" % (
                dbpathname, str(db.error())))  # pragma: NO COVER
        return db
def decorated_function(*args, **kwargs):
    """Page-cache wrapper: serve the view's cached output when enabled.

    Cache key mixes the db record count, the request URL and the view name,
    so any write to the main db changes the count and invalidates entries.
    """
    # Debug
    if not current_app.config['CACHE']:
        return f(*args, **kwargs)
    db = DB()
    db.open("/tmp/page_cache.kch")
    res = None
    # NOTE(review): relies on hash() being stable across processes --
    # true on CPython 2, but not under Python 3 hash randomization.
    fancy = hash("{}{}{}".format(db_meta_info()['count'], request.url,
                                 f.func_name))
    res = db.get(fancy)
    if not res:
        # Miss: render the view and store the result for next time.
        res = f(*args, **kwargs)
        db.set(fancy, res)
    db.close()
    return res
def get_items_last_X_days(db_file, X, munge=True):
    """Bucket posts from the last `X` days by day and poster.

    Returns {day_unix_ms: {person: count}}.  With munge=True timestamps are
    truncated to the calendar day; otherwise the full datetime is used.
    Walks newest-first and stops at the first record older than the cutoff.
    """
    dates = {}
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    X_days_ago = datetime.now() - timedelta(days=X)
    cur = db.cursor()
    cur.jump_back()  # newest first, so we can bail at the cutoff
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded = loads(rec[1])
        unix = float(loaded['created_at'])
        time = datetime.fromtimestamp(unix)
        if time > X_days_ago:
            if munge:
                date_obj = date(year=time.year, month=time.month, day=time.day)
            else:
                date_obj = time
            # Javascript expects Date.UTC to spit out dates of a certain
            # length.
            day_unix = int(mktime(date_obj.timetuple())) * 1000
            if dates.get(day_unix, False) == False:
                dates[day_unix] = {loaded["person"]: 1}
            else:
                relevant_dict = dates[day_unix]
                if relevant_dict.get(loaded["person"], False) == False:
                    relevant_dict[loaded["person"]] = 1
                else:
                    relevant_dict[
                        loaded["person"]] = relevant_dict[loaded["person"]] + 1
        else:
            # Records are ordered newest-first: everything beyond this one
            # is older still, so stop scanning.
            break
        cur.step_back()
    cur.disable()
    db.close()
    return dates
def init_read(self):
    """Prepare this object's storage for reading, keyed on file extension.

    .csv     -> raw file handle
    .json    -> fully loaded into memory
    .kch     -> kyotocabinet DB opened read-only
    .sqlite3 -> sqlite3 connection plus cursor
    Raises dexy UserFeedback for unknown extensions.
    """
    self.mode = "read"
    if self.ext == ".csv":
        self._file = open(self.filename, "rb")
    elif self.ext == ".json":
        with open(self.filename, "rb") as f:
            self._storage = json.load(f)
    elif self.ext == ".kch":
        # Imported lazily so kyotocabinet is only required when used.
        from kyotocabinet import DB
        self._storage = DB()
        # NOTE(review): open() result is unchecked here, unlike init_write
        # -- a missing file fails silently until first access.
        self._storage.open(self.filename, DB.OREADER)
    elif self.ext == ".sqlite3":
        import sqlite3
        self._storage = sqlite3.connect(self.filename)
        self._cursor = self._storage.cursor()
    else:
        raise dexy.exceptions.UserFeedback("unsupported extension %s" % self.ext)
def get_last_items(db_file, pages=1): items = [] db = DB() if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE): print "Could not open database." cur = db.cursor() cur.jump_back() while len(items) < (pages * FILTER_MAX): rec = cur.get(False) if not rec: break items.append(rec) cur.step_back() cur.disable() db.close() return items
def get_all_items(db_file): items = [] db = DB() if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE): print "Could not open database." cur = db.cursor() cur.jump() while True: rec = cur.get(False) if not rec: break items.append(rec) cur.step() cur.disable() db.close() sorted_items_for_viewing = [loads(item[1]) for item in items] return sorted_items_for_viewing
def gen_thumbnails(db_file):
    """Walk every record and (re)generate thumbnails for image URLs in place.

    Each image record gets is_image set and, when generation succeeds, a
    "thumbnail" field; the updated record is written back through the
    cursor.  Exits the process if the db cannot be opened.
    """
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OWRITER):
        sys.exit(1)
    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded = loads(rec[1])
        is_image = loaded["url"].lower().endswith(
            ("jpg", "jpeg", "gif", "png"))
        if is_image:
            print "Thumbnailing {}".format(loaded["url"])
            loaded["is_image"] = True
            try:
                thumbnail = gen_thumbnail_for_url(loaded["url"], rec[0])
            except IOError as e:
                # Fetch/convert failed: still persist the is_image flag,
                # then move on to the previous record.
                print "IOError: {}".format(e)
                print "Save result: {}".format(cur.set_value(dumps(loaded)))
                cur.step_back()
                continue
            if thumbnail:
                loaded["thumbnail"] = thumbnail
                print "Thumbnailed {}".format(loaded["url"])
                print "Save result: {}".format(cur.set_value(dumps(loaded)))
        cur.step_back()
    cur.disable()
    db.close()
    return True
def __init__(self, path, truncate=False):
    """
    Open a new connection to a database using the Kyoto Cabinet engine.

    Args:
    * path (str): Path to database.

    Kwargs:
    * truncate (bool, False): If database should be truncated before
      opening.

    Raises:
    * PathError: when the database cannot be opened.
    """
    self.db = DB()
    self.batchsize = 1000
    self.batch = {}
    # Tree database (.kct) tuned for bulk writes: no alignment power,
    # 10M buckets, 2 GiB memory-mapped region.
    dbparams = '.kct#apow=0#bnum=10000000#msiz=' + str(2 << 30)
    if truncate:
        result = self.db.open(path + dbparams,
                              DB.OWRITER | DB.OCREATE | DB.OTRUNCATE)
    else:
        result = self.db.open(path + dbparams, DB.OWRITER)
    if not result:
        # BUG FIX: removed the unreachable exit(1) that followed this
        # raise -- control never reaches a statement after raise.
        raise PathError('DNA outdb open error: %s ' % self.db.error())
def get_items_on_page(page, db_file): item_iter = 0 items = [] db = DB() if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE): print "Could not open database." cur = db.cursor() cur.jump_back() while len(items) < FILTER_MAX: rec = cur.get(False) if not rec: break if item_iter >= (FILTER_MAX * page): items.append(rec) item_iter = item_iter + 1 cur.step_back() cur.disable() db.close() return items
def main():
    """List -- and with a '--delete' third argument, remove -- all posts
    by a given user.

    argv: db_file username [--delete]
    Returns -1 on missing arguments or a failed database open.
    """
    db_file = argv[1]
    username = argv[2]
    if not db_file and not username:
        print "Need db_file and username."
        return -1
    db = DB()
    if not db.open("{0}".format(db_file), DB.OWRITER):
        print "Could not open database."
        return -1
    # First pass: collect matching keys so removal doesn't disturb the cursor.
    all_keys = []
    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded = loads(rec[1])
        if loaded["person"] == username:
            all_keys.append(cur.get_key())
        cur.step()
    cur.disable()
    print "Found {} records.".format(len(all_keys))
    # Second pass: dry-run by default; only delete when explicitly asked.
    for key in all_keys:
        print "Pending {}...".format(key)
        if len(argv) > 3 and argv[3] == '--delete':
            print "Removing {}...".format(key)
            if not db.remove(key):
                print "Could not remove key: {}".format(db.error())
    db.close()
def get_page_count(item_filter=lambda x: True):
    """Count records accepted by `item_filter` and convert to a page count."""
    count = 0
    db = DB()
    db_file = current_app.config['DB_FILE']
    if not db.open("{0}".format(db_file),
                   DB.OREADER | DB.OWRITER | DB.OCREATE):
        print "Could not open database (get_page_count). Error: {}".format(
            db.error())
    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        if item_filter(rec):
            count = count + 1
        cur.step_back()
    cur.disable()
    db.close()
    # Python 2 integer (floor) division: a partial final page is dropped.
    # NOTE(review): confirm callers expect floor rather than ceil here.
    return count / FILTER_MAX
def get_user_stats(username, db_file): item = { "username": username, "aliases": [], "total_posts": 0, "domains": {}, "first_post_date": None, "first_post_date_unix": None, "most_recent_post": None, "most_recent_post_unix": 0, "average_posts_per_hour": 0.0, "average_posts_per_day": 0.0, "average_posts_per_week": 0.0 } db = DB() if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE): print "Could not open database." cur = db.cursor() cur.jump() while True: rec = cur.get(False) if not rec: break loaded_rec = loads(rec[1]) if loaded_rec['person'] != username: cur.step() continue # Looks like this is a post by the user we're looking for split = get_domain(loaded_rec) if item['domains'].get(split, False) == False: item['domains'][split] = 1 else: item['domains'][split] = item['domains'][split] + 1 if item['first_post_date_unix'] is None: item['first_post_date_unix'] = loaded_rec['created_at'] if item['most_recent_post_unix'] < loaded_rec['created_at']: item['most_recent_post_unix'] = loaded_rec['created_at'] item['total_posts'] = item['total_posts'] + 1 cur.step() cur.disable() db.close() # Clean up everything first_time = None if item['first_post_date_unix'] is not None: unix = float(item['first_post_date_unix']) first_time = datetime.fromtimestamp(unix) item['first_post_date'] = first_time.isoformat() recent_time = None if item['most_recent_post_unix'] is not None: unix = float(item['most_recent_post_unix']) recent_time = datetime.fromtimestamp(unix) item['most_recent_post'] = recent_time.isoformat() if first_time and recent_time: delta = recent_time - first_time item['user_age_days'] = delta.days item['user_age_seconds'] = delta.total_seconds() item['average_posts_per_hour'] = item['total_posts'] / ( delta.total_seconds() / 60.0) item['average_posts_per_day'] = item['total_posts'] / ( delta.total_seconds() / 60.0 / 24.0) item['average_posts_per_week'] = item['total_posts'] / ( delta.total_seconds() / 60.0 / 24.0 / 7.0) return item
return tf_idf(word_list) from kyotocabinet import DB from collections import defaultdict from array import array from zkit.zitertools import chunkiter from operator import itemgetter from zdata.tag.name2id import NAME2ID from zkit.txt_cleanup import sp_txt ID2NAME = defaultdict(list) for name, id in NAME2ID.iteritems(): ID2NAME[id].append(name) db_tag_bayes = DB() db_tag_bayes.open(join(ZDATA_PATH, "data/bayes.kch"), DB.OREADER) def tag_id_rank_list_by_txt(txt): txt = txt.lower() tag_id_list_rank = defaultdict(int) for word, rank in tf_idf_seg_txt(txt): #print word ars = db_tag_bayes.get(word) if ars: ar = array('I') ar.fromstring(ars) #print len(ar) #print db_tag_bayes[word] #print word, ar
def __init__(self, dir, subdirs=[]):
    """Bind this object to the shared .kch database for `dir`.

    One handle per directory is opened lazily and cached in the
    module-level dbList; `subdirs` are joined into this object's key.
    """
    # NOTE: `subdirs=[]` mutable default is harmless -- it is never mutated.
    if dir not in dbList:
        handle = DB()
        handle.open(dir + ".kch", DB.OWRITER | DB.OCREATE)
        dbList[dir] = handle
    self.db = dbList[dir]
    self.key = "/".join(subdirs)