/
CDB_Writer.py
59 lines (51 loc) · 2.14 KB
/
CDB_Writer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*-coding: utf-8 -*-
import os
import codecs
import cdb
import subprocess
# Default value of CDB_Writer's ``limit_file_size`` argument, compared against
# the byte size of the cdb file currently being built.
# Original author's note: 2.5GB (file) -> about 3.3GB (cdb).
LFS_DEFAULT = 2.5 * 1024 ** 3
class CDB_Writer(object):
    """Write key/value records into a numbered series of cdb files.

    Records are appended to ``<dbname>.0``; every ``fetch`` records the size
    of the file under construction is checked, and once it exceeds
    ``limit_file_size`` the current cdb is finished and ``<dbname>.1``,
    ``<dbname>.2``, ... are started in turn.

    Each time a new cdb is started, a line ``"<key> <cdb basename>\\n"`` is
    written to the keymap file, where ``<key>`` is the first key stored in
    that cdb.  No line is written for the initial cdb (``<dbname>.0``).

    Args:
        dbname: path prefix of the cdb files to create.
        keyMapFile: file name of the keymap, created in the directory of
            ``dbname`` (the current directory when ``dbname`` has none).
        limit_file_size: size in bytes above which a new cdb is started.
            Because the check runs only every ``fetch`` records, a cdb can
            grow past this limit before it is split.
        fetch: number of records between two size checks.
        encoding: encoding used for the keymap file and for keys passed to
            ``add``.
    """

    def __init__(self, dbname, keyMapFile, limit_file_size=LFS_DEFAULT,
                 fetch=1000000, encoding='utf-8'):
        # the options.
        self.dbname = dbname
        # used by CDB_Reader to decide which cdb includes the query key
        self.keyMapFile = keyMapFile
        self.limit_file_size = limit_file_size
        # determines how often to check if current cdb size exceeds the limit
        self.fetch = fetch
        self.record_counter = 0
        self.num_of_cdbs = 0
        self.encoding = encoding

        # Define the resources up front so close()/__del__ stay safe even if
        # one of the open calls below raises.
        self.cdb = None
        self.keymap = None
        self.tmpfile = None

        self._open_cdb()

        # An empty dirname means "current directory"; without the fallback
        # the path would become "/<keyMapFile>", i.e. the filesystem root.
        dbdir = os.path.dirname(self.dbname) or os.curdir
        keyMapPath = "{}/{}".format(dbdir, keyMapFile)
        self.keymap = codecs.open(keyMapPath, 'w', self.encoding)

    def _open_cdb(self):
        """Start building cdb number ``self.num_of_cdbs``; return its path."""
        dbname = "{}.{}".format(self.dbname, self.num_of_cdbs)
        print("processing {}".format(dbname))
        dbname_tmp = dbname + ".tmp"
        # Remembered so add() can check the size of the file being built.
        self.tmpfile = dbname_tmp
        # NOTE(review): per python-cdb, cdbmake builds into the tmp file and
        # finish() moves it to the final name -- confirm against its docs.
        self.cdb = cdb.cdbmake(dbname, dbname_tmp)
        return dbname

    def close(self):
        """Finish the current cdb and close the keymap file.

        Safe to call more than once, and after a partially failed
        ``__init__``.  The keymap is closed even if finishing the cdb raises.
        """
        current = getattr(self, 'cdb', None)
        try:
            if current is not None:
                self.cdb = None
                current.finish()
        finally:
            keymap = getattr(self, 'keymap', None)
            if keymap is not None:
                self.keymap = None
                keymap.close()

    def __del__(self):
        # Kept for backward compatibility: callers that never call close()
        # still get their last cdb finished when the writer is collected.
        self.close()

    def add(self, key, value):
        """Append one record, starting a new cdb first if the limit is hit.

        Args:
            key: text key; encoded with ``self.encoding`` before storing.
            value: value passed unchanged to the underlying cdb.
        """
        if self.record_counter % self.fetch == 0:
            # Stat the file directly instead of spawning ``wc -c``.
            size = os.path.getsize(self.tmpfile)
            if size > self.limit_file_size:
                self.cdb.finish()
                self.cdb = None
                self.num_of_cdbs += 1
                dbnamei = self._open_cdb()
                self.record_counter = 0
                # save head keys of each split cdb
                filebase = os.path.basename(dbnamei)
                self.keymap.write(u"{} {}\n".format(key, filebase))
        self.record_counter += 1
        self.cdb.add(key.encode(self.encoding), value)