Example #1
def main():
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: %s outdir textfile1 textfile2 ...\n' %
                         sys.argv[0])
        sys.exit(1)
    outdir = sys.argv[1]
    tfdb = DB()
    if not tfdb.open(os.path.join(outdir, 'tf.kch'),
                     DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open tfdb: %s\n' % str(tfdb.error()))
        sys.exit(1)
    dfdb = DB()
    if not dfdb.open(os.path.join(outdir, 'df.kch'),
                     DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open dfdb: %s\n' % str(dfdb.error()))
        sys.exit(1)
    tfidfdb = DB()
    if not tfidfdb.open(os.path.join(outdir, 'tfidf.kch'),
                        DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open tfidfdb: %s\n' % str(tfidfdb.error()))
        sys.exit(1)

    print 'Count words ...'
    for i in range(len(sys.argv) - 2):
        filename = sys.argv[i + 2]
        print '(%d/%d) %s' % (i + 1, len(sys.argv) - 2, filename)
        count_words(tfdb, dfdb, filename)
    print 'Calculate TFIDF ...'
    save_tfidf(tfdb, dfdb, tfidfdb)

    tfdb.close()
    dfdb.close()
    tfidfdb.close()
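
All of these examples share the same basic Kyoto Cabinet lifecycle: create a DB object, open a .kch file with the appropriate mode flags, work with it, and close it. A minimal sketch of that lifecycle (the file name here is illustrative):

from kyotocabinet import DB

db = DB()
if not db.open("example.kch", DB.OWRITER | DB.OCREATE):
    raise IOError("open error: %s" % str(db.error()))
try:
    db.set("key", "value")       # store one record
    value = db.get("key")        # returns the stored string, or None if absent
finally:
    db.close()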
Example #2
def __load_blast_data(blast):
    # Connect to kyoto db
    db = DB()
    if not db.open("/opt/gene2accession/gene2accession.kch", DB.OREADER):
        raise Exception("Could not load gene2accession.kch: " + str(db.error()))

    hits = {}
    gi_num = re.compile(r'gi\|([0-9]+)')
    for line in blast:
        split_line = line.split('\t')

        # Important data
        evalue = float(split_line[10])

        gi_nums = gi_num.findall(split_line[12])
        genome_ids = [db.get(x) for x in gi_nums if db.get(x) is not None]

        # Thanks to Peter's parser, the gi list and org list are the same
        # length (the first gi column is also the first gi in the "master" gi
        # column)
        for org in genome_ids:
            if org in hits:
                hits[org].append(evalue)
            else:
                hits[org] = [evalue]
    db.close()
    return hits
Example #3
def get_items(item_filter, db_file, page=0):
    item_iter = 0
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()
    while len(items) < FILTER_MAX:
        rec = cur.get(False)
        if not rec:
            break

        if item_iter != (FILTER_MAX * page):
            if item_filter(rec):
                item_iter = item_iter + 1
            cur.step_back()
            continue

        if item_filter(rec):
            items.append(rec)

        cur.step_back()
    cur.disable()
    db.close()

    sorted_items = sorted(items, key=get_key, reverse=True)
    sorted_items_for_viewing = [loads(item[1]) for item in sorted_items]
    for item in sorted_items_for_viewing:
        if item['title'] is None or item['title'] == "":
            item['title'] = item['url']
    return sorted_items_for_viewing
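
Most of the read-side examples in this collection walk the database with the same reverse cursor pattern (jump_back, get, step_back). A condensed sketch of that pattern, where handle_record is a hypothetical stand-in for each example's per-record logic:

from kyotocabinet import DB

def walk_newest_first(db_file, handle_record):
    db = DB()
    if not db.open(db_file, DB.OREADER):
        raise IOError("open error: %s" % str(db.error()))
    cur = db.cursor()
    cur.jump_back()              # position on the last record
    while True:
        rec = cur.get(False)     # (key, value) pair, or None past the start
        if not rec:
            break
        handle_record(rec)
        cur.step_back()          # move toward the first record
    cur.disable()
    db.close()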
Example #4
def get_post_num(post_num, db_file):
    item = None
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump()
    i = 0
    while True:
        rec = cur.get(False)
        if not rec:
            break

        if i == post_num:
            item = rec

        cur.step()
        i = i + 1

    cur.disable()
    db.close()

    if item is not None:
        return loads(item[1])
    return dict()
Example #5
def aggregate_by_hour(db_file):
    # Initialize the dict with each hour
    hours = {key: 0 for key in range(0, 24)}
    db = DB()

    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()

    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded = loads(rec[1])
        unix = float(loaded['created_at'])
        time = datetime.fromtimestamp(unix)

        hours[time.hour] = hours[time.hour] + 1

        cur.step_back()
    cur.disable()
    db.close()

    hours = [{
        'name': "{}:00".format(key),
        'data': [hours[key]]
    } for key in hours]
    return hours
Example #6
    def init_write(self):
        self.mode = "write"

        if self.ext == ".csv":
            self._data_file = open(self.filename, "wb")
            self._writer = csv.writer(self._data_file)
            if self.headers:
                self._writer.writerow(self.headers)

        elif self.ext == ".json":
            self._storage = {}

        elif self.ext == ".kch":
            from kyotocabinet import DB
            self._storage = DB()
            if not self._storage.open(self.filename, DB.OWRITER | DB.OCREATE):
                msg = "Error opening kyotocabinet db: %s" % (
                    self._storage.error())
                raise dexy.exceptions.UserFeedback(msg)

        elif self.ext == ".sqlite3":
            self.init_write_sqlite3()

        else:
            raise dexy.exceptions.UserFeedback("unsupported extension %s" %
                                               self.ext)
Example #7
 def startup(self):
   ''' Open the index.
   '''
   # pylint: disable=import-error,import-outside-toplevel
   from kyotocabinet import DB
   self._kyoto = DB()
   self._kyoto.open(self.path, DB.OWRITER | DB.OCREATE)
Example #8
def top_things(db_file):
    urls = {}
    people = {}
    graph = {}

    db = DB()

    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database. (Top things)"

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded_rec = loads(rec[1])
        split = get_domain(loaded_rec)

        if urls.get(split, False) == False:
            urls[split] = 1
        else:
            urls[split] = urls[split] + 1

        person = loaded_rec['person']
        if people.get(person, False) == False:
            people[person] = 1
        else:
            people[person] = people[person] + 1

        if split is not None and split != "" and \
            person is not None and person != "":
            # Build a crazy relational graph out of my nosql data
            if graph.get(split, False) == False:
                graph[split] = {
                    "is_person": False,
                    "data": [person],
                    "linked_to_count": 1
                }
            elif person not in graph[split]["data"]:
                graph[split]["data"].append(person)
                graph[split][
                    "linked_to_count"] = graph[split]["linked_to_count"] + 1

            if graph.get(person, False) == False:
                graph[person] = {"is_person": True, "data": [split]}
            elif split not in graph[person]["data"]:
                graph[person]["data"].append(split)

        cur.step_back()
    cur.disable()
    db.close()

    def get_one(x):
        return x[1]

    return (sorted(urls.items(), key=get_one, reverse=True),
            sorted(people.items(), key=get_one, reverse=True), graph)
Example #9
 def open_db(self, name):
     db = DB()
     db.open(
         join(self.path, "%s.kch"%name),
         DB.OWRITER | DB.OCREATE
     )
     self.opendb.append(db)
     return db
Example #10
def insert_item(url, person, db_file, submitted_title=''):
    mimetype = "application/json"
    db = DB()

    if not db.open("{0}".format(db_file), DB.OWRITER | DB.OCREATE):

        response = {}
        response[
            'What happened?'] = "Couldn't open the damn database. Error: {0}".format(
                db.error())
        return Response(dumps(response), mimetype=mimetype)

    if is_url_in_db(db, url):
        return Response('{"What happened?": "Someone '\
            'tried to submit a duplicate URL."}',
            mimetype=mimetype)

    title = url
    summary = "~?~"
    try:
        thing = urlopen(url, timeout=10)
        soup = BeautifulSoup(thing)
        title = soup.title.string

        # Do some dumb summarizing if we can
        def concat(a, v):
            return a + " " + v.strip()

        visible_stuff = filter(visible, soup.findAll(text=True))
        summary = reduce(concat, visible_stuff, "")[:900] + "..."
    except:
        pass
        #return Response('{"What happened?": '\
        #    'I dunno bs4 messed up somehow."}',
        #    mimetype=mimetype)

    created_at = int(mktime(datetime.now().utctimetuple()))

    is_image = url.lower().endswith(("jpg", "jpeg", "gif", "png"))
    thumbnail = gen_thumbnail_for_url(url, str(created_at))

    record = {
        "created_at": created_at,
        "title": title,
        "url": url,
        "person": person,
        "summary": summary,
        "person_color": PERSON_COLORS[random.randint(0,
                                                     len(PERSON_COLORS) - 1)],
        "is_image": is_image,
        "thumbnail": thumbnail,
        "comment": submitted_title
    }
    db.set(created_at, dumps(record))
    db.close()

    return Response('{"What happened?": "MUDADA"}', mimetype=mimetype)
Example #11
 def __init__(self, db_file):
     #from train import TAG2ID, WORD2ID#, BAYES_RANK
     #self.ider = WORD2ID
     self.db = DB()
     self.db_file = db_file
     print path.join(KYOTO_DB_PATH, self.db_file)
     if not self.db.open(path.join(KYOTO_DB_PATH, self.db_file),
                         DB.OWRITER | DB.OCREATE):
         print >> sys.stderr, "open error: " + str(self.db.error())
Example #12
File: cache.py Project: l0rb/og_api
 def __init__(self, dir, subdirs=[]):
     dir += "_zlib"
     if dir not in dbList:
         dbList[dir] = DB()
         dbList[dir].open(
             dir + ".kch#ops=c#log=" + dir +
             ".log#logkinds=debu#zcomp=zlib", DB.OWRITER | DB.OCREATE)
     self.db = dbList[dir]
     self.key = "/".join(subdirs)
Example #13
def db_meta_info():
    meta = {}
    db = DB()
    db_file = current_app.config['DB_FILE']
    if not db.open("{0}".format(db_file), DB.OREADER):
        print "Could not open database (meta info)."
    meta["size"] = db.size()
    meta["count"] = db.count()
    db.close()

    return meta
Example #14
 def __init__(self, path):
     # create the database object
     self._path = path
     self._db = DB()
     # open the database
     if not self._db.open(path, DB.OREADER | DB.OWRITER | DB.OCREATE):
         raise GrapheekDataKyotoCabinetInitFailureException(
             str(self._db.error()))
     super(KyotoCabinetGraph, self).__init__()
     self._ensure_prepared()
     self._closed = False
Example #15
def get_post_by_date(key, db_file):
    item = None
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    item = db.get(key)

    db.close()
    if item is not None:
        return loads(item)
    return dict()
Example #16
def main():
    db = DB()
    if not db.open("bayes.kch", DB.OWRITER | DB.OCREATE):
        return

    with open("word_tf.txt") as word_tf:
        for line in word_tf:
            line = line.strip()
            word, bayes_list = loads(line)
            print word
            if bayes_list:
                ar = array('I')
                ar.fromlist(lineiter(bayes_list))
                db[word] = ar.tostring()
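
Example #29 later in this collection reads these packed values back with array.fromstring. A compact round-trip sketch of the same encoding (Python 2 array API, as in the examples; the file name is illustrative):

from array import array
from kyotocabinet import DB

db = DB()
if not db.open("packed.kch", DB.OWRITER | DB.OCREATE):
    raise IOError("open error: %s" % str(db.error()))

ar = array('I', [1, 2, 3])            # unsigned integers, as in main() above
db["numbers"] = ar.tostring()         # pack to raw bytes (tobytes() in Python 3)

back = array('I')
back.fromstring(db.get("numbers"))    # unpack (frombytes() in Python 3)
db.close()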
Example #17
 def dbOpen(name):
     db = DB()
     dbpathname = abspath(self.path) + '/' + name + ".kch"
     if self.create:
         # if not db.open(abspath(self.path) + '/' + name + ".kch",
         # DB.OWRITER | DB.OCREATE | DB.OAUTOSYNC | DB.OAUTOTRAN):
         if not db.open(dbpathname, DB.OWRITER | DB.OCREATE):
             raise IOError("open error: %s %s" % (
                 dbpathname, str(db.error())))  # pragma: NO COVER
         return db
     else:
         # if not db.open(abspath(self.path) + '/' + name + ".kch",
         #         DB.OWRITER | DB.OAUTOSYNC | DB.OAUTOTRAN):
         if not db.open(dbpathname, DB.OWRITER):  # pragma: NO COVER
             raise IOError("open error: %s %s" % (
                 dbpathname, str(db.error())))  # pragma: NO COVER
         return db
Example #18
    def decorated_function(*args, **kwargs):
        # Debug
        if not current_app.config['CACHE']:
            return f(*args, **kwargs)

        db = DB()
        db.open("/tmp/page_cache.kch")
        res = None
        fancy = hash("{}{}{}".format(db_meta_info()['count'], request.url, f.func_name))

        res = db.get(fancy)
        if not res:
            res = f(*args, **kwargs)
            db.set(fancy, res)

        db.close()
        return res
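
This snippet is only the inner function of a Flask page-caching decorator. A hedged sketch of how the surrounding decorator likely looks (the name page_cached is hypothetical):

from functools import wraps

def page_cached(f):
    @wraps(f)
    def decorated_function(*args, **kwargs):
        # cache check, kyotocabinet lookup and store as in the body above
        return f(*args, **kwargs)
    return decorated_function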
Example #19
def get_items_last_X_days(db_file, X, munge=True):
    dates = {}
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    X_days_ago = datetime.now() - timedelta(days=X)

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded = loads(rec[1])
        unix = float(loaded['created_at'])
        time = datetime.fromtimestamp(unix)

        if time > X_days_ago:
            if munge:
                date_obj = date(year=time.year, month=time.month, day=time.day)
            else:
                date_obj = time
            # Javascript expects Date.UTC to spit out dates of a certain
            # length.
            day_unix = int(mktime(date_obj.timetuple())) * 1000
            if dates.get(day_unix, False) == False:
                dates[day_unix] = {loaded["person"]: 1}
            else:
                relevant_dict = dates[day_unix]

                if relevant_dict.get(loaded["person"], False) == False:
                    relevant_dict[loaded["person"]] = 1
                else:
                    relevant_dict[
                        loaded["person"]] = relevant_dict[loaded["person"]] + 1
        else:
            break

        cur.step_back()
    cur.disable()
    db.close()

    return dates
Example #20
    def init_read(self):
        self.mode = "read"

        if self.ext == ".csv":
            self._file = open(self.filename, "rb")
        elif self.ext == ".json":
            with open(self.filename, "rb") as f:
                self._storage = json.load(f)
        elif self.ext == ".kch":
            from kyotocabinet import DB
            self._storage = DB()
            self._storage.open(self.filename, DB.OREADER)
        elif self.ext == ".sqlite3":
            import sqlite3
            self._storage = sqlite3.connect(self.filename)
            self._cursor = self._storage.cursor()
        else:
            raise dexy.exceptions.UserFeedback("unsupported extension %s" %
                                               self.ext)
Example #21
def get_last_items(db_file, pages=1):
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()
    while len(items) < (pages * FILTER_MAX):
        rec = cur.get(False)
        if not rec:
            break

        items.append(rec)
        cur.step_back()
    cur.disable()
    db.close()

    return items
Example #22
def get_all_items(db_file):
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        items.append(rec)
        cur.step()

    cur.disable()
    db.close()

    sorted_items_for_viewing = [loads(item[1]) for item in items]
    return sorted_items_for_viewing
Example #23
def gen_thumbnails(db_file):
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OWRITER):
        sys.exit(1)

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)

        if not rec:
            break

        loaded = loads(rec[1])
        is_image = loaded["url"].lower().endswith(
            ("jpg", "jpeg", "gif", "png"))

        if is_image:
            print "Thumbnailing {}".format(loaded["url"])
            loaded["is_image"] = True
            try:
                thumbnail = gen_thumbnail_for_url(loaded["url"], rec[0])
            except IOError as e:
                print "IOError: {}".format(e)
                print "Save result: {}".format(cur.set_value(dumps(loaded)))
                cur.step_back()
                continue

            if thumbnail:
                loaded["thumbnail"] = thumbnail
                print "Thumbnailed {}".format(loaded["url"])
                print "Save result: {}".format(cur.set_value(dumps(loaded)))

        cur.step_back()

    cur.disable()
    db.close()

    return True
Example #24
    def __init__(self, path, truncate=False):
        """
        Open a new connection to a database using the Kyoto Cabinet engine.

        Args:
            * path (str): Path to database.

        Kwargs:
            * truncate (bool, False): If database should be truncated before opening.
        """
        self.db = DB()
        self.batchsize = 1000
        self.batch = {}
        dbparams = '.kct#apow=0#bnum=10000000#msiz=' + str(2 << 30)
        if truncate:
            result = self.db.open(path + dbparams,
                                  DB.OWRITER | DB.OCREATE | DB.OTRUNCATE)
        else:
            result = self.db.open(path + dbparams, DB.OWRITER)
        if not result:
            raise PathError('DNA outdb open error: %s ' % self.db.error())
Example #25
def get_items_on_page(page, db_file):
    item_iter = 0
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()
    while len(items) < FILTER_MAX:
        rec = cur.get(False)
        if not rec:
            break

        if item_iter >= (FILTER_MAX * page):
            items.append(rec)

        item_iter = item_iter + 1
        cur.step_back()
    cur.disable()
    db.close()

    return items
Example #26
def main():
    if len(argv) < 3:
        print "Need db_file and username."
        return -1

    db_file = argv[1]
    username = argv[2]

    db = DB()
    if not db.open("{0}".format(db_file), DB.OWRITER):
        print "Could not open database."
        return -1

    all_keys = []
    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded = loads(rec[1])
        if loaded["person"] == username:
            all_keys.append(cur.get_key())

        cur.step()
    cur.disable()

    print "Found {} records.".format(len(all_keys))
    for key in all_keys:
        print "Pending {}...".format(key)
        if len(argv) > 3 and argv[3] == '--delete':
            print "Removing {}...".format(key)
            if not db.remove(key):
                print "Could not remove key: {}".format(db.error())

    db.close()
Example #27
def get_page_count(item_filter=lambda x: True):
    count = 0
    db = DB()
    db_file = current_app.config['DB_FILE']
    if not db.open("{0}".format(db_file),
                   DB.OREADER | DB.OWRITER | DB.OCREATE):
        print "Could not open database (get_page_count). Error: {}".format(
            db.error())

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        if item_filter(rec):
            count = count + 1

        cur.step_back()

    cur.disable()
    db.close()
    return count / FILTER_MAX
Example #28
def get_user_stats(username, db_file):
    item = {
        "username": username,
        "aliases": [],
        "total_posts": 0,
        "domains": {},
        "first_post_date": None,
        "first_post_date_unix": None,
        "most_recent_post": None,
        "most_recent_post_unix": 0,
        "average_posts_per_hour": 0.0,
        "average_posts_per_day": 0.0,
        "average_posts_per_week": 0.0
    }

    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded_rec = loads(rec[1])
        if loaded_rec['person'] != username:
            cur.step()
            continue

        # Looks like this is a post by the user we're looking for
        split = get_domain(loaded_rec)

        if item['domains'].get(split, False) == False:
            item['domains'][split] = 1
        else:
            item['domains'][split] = item['domains'][split] + 1

        if item['first_post_date_unix'] is None:
            item['first_post_date_unix'] = loaded_rec['created_at']

        if item['most_recent_post_unix'] < loaded_rec['created_at']:
            item['most_recent_post_unix'] = loaded_rec['created_at']

        item['total_posts'] = item['total_posts'] + 1

        cur.step()

    cur.disable()
    db.close()

    # Clean up everything

    first_time = None
    if item['first_post_date_unix'] is not None:
        unix = float(item['first_post_date_unix'])
        first_time = datetime.fromtimestamp(unix)
        item['first_post_date'] = first_time.isoformat()

    recent_time = None
    if item['most_recent_post_unix']:
        unix = float(item['most_recent_post_unix'])
        recent_time = datetime.fromtimestamp(unix)
        item['most_recent_post'] = recent_time.isoformat()

    if first_time and recent_time:
        delta = recent_time - first_time
        item['user_age_days'] = delta.days
        item['user_age_seconds'] = delta.total_seconds()
        item['average_posts_per_hour'] = item['total_posts'] / (
            delta.total_seconds() / 3600.0)
        item['average_posts_per_day'] = item['total_posts'] / (
            delta.total_seconds() / 3600.0 / 24.0)
        item['average_posts_per_week'] = item['total_posts'] / (
            delta.total_seconds() / 3600.0 / 24.0 / 7.0)

    return item
Example #29
    return tf_idf(word_list)


from kyotocabinet import DB
from collections import defaultdict
from array import array
from zkit.zitertools import chunkiter
from operator import itemgetter
from zdata.tag.name2id import NAME2ID
from zkit.txt_cleanup import sp_txt

ID2NAME = defaultdict(list)
for name, id in NAME2ID.iteritems():
    ID2NAME[id].append(name)

db_tag_bayes = DB()
db_tag_bayes.open(join(ZDATA_PATH, "data/bayes.kch"), DB.OREADER)


def tag_id_rank_list_by_txt(txt):
    txt = txt.lower()
    tag_id_list_rank = defaultdict(int)
    for word, rank in tf_idf_seg_txt(txt):
        #print word
        ars = db_tag_bayes.get(word)
        if ars:
            ar = array('I')
            ar.fromstring(ars)
            #print len(ar)
            #print db_tag_bayes[word]
            #print word, ar
Example #30
File: cache.py Project: l0rb/og_api
 def __init__(self, dir, subdirs=[]):
     if dir not in dbList:
         dbList[dir] = DB()
         dbList[dir].open(dir + ".kch", DB.OWRITER | DB.OCREATE)
     self.db = dbList[dir]
     self.key = "/".join(subdirs)