def get(self, searchKey, exhaustive=False):
    # exhaustive must be True if keys are not sorted in ascending order
    if exhaustive:
        for i in range(len(self.mapping)):
            nowCDB = self.mapping[i]['cdb']
            targetCDB = cdb.init(nowCDB)
            if self.repeated_keys:
                value = targetCDB.getall(searchKey.encode('utf-8'))
            else:
                value = targetCDB.get(searchKey.encode('utf-8'))
            if value:
                return value
        return None
    else:
        nowCDB = self.mapping[0]['cdb']
        for i in range(1, len(self.mapping)):
            nowKey = self.mapping[i]['key']
            if self.numerical_keys:
                if int(searchKey) < int(nowKey):
                    break
            else:
                if searchKey.encode('utf-8') < nowKey:
                    break
            nowCDB = self.mapping[i]['cdb']
        targetCDB = cdb.init(nowCDB)
        if self.repeated_keys:
            value = targetCDB.getall(searchKey.encode('utf-8'))
        else:
            value = targetCDB.get(searchKey.encode('utf-8'))
        return value
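# A minimal usage sketch for the sharded get() above, assuming each entry of
# self.mapping is a dict with 'key' (the first key stored in that shard, shards
# sorted ascending) and 'cdb' (the shard's file path). Shard files and data
# here are hypothetical.
import cdb

def build_shard(path, pairs):
    maker = cdb.cdbmake(path, path + '.tmp')
    for k, v in pairs:
        maker.add(k, v)
    maker.finish()

build_shard('shard0.cdb', [('apple', '1'), ('banana', '2')])
build_shard('shard1.cdb', [('mango', '3'), ('peach', '4')])
mapping = [{'key': 'apple', 'cdb': 'shard0.cdb'},
           {'key': 'mango', 'cdb': 'shard1.cdb'}]

# The non-exhaustive branch in miniature: advance while the search key does
# not sort before the next shard's first key, then probe that single shard.
searchKey = 'mango'
nowCDB = mapping[0]['cdb']
for entry in mapping[1:]:
    if searchKey.encode('utf-8') < entry['key']:
        break
    nowCDB = entry['cdb']
print cdb.init(nowCDB).get(searchKey.encode('utf-8'))  # -> '3'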
def __init__(self, key, reverse=False, cbc=False, basedir='.', debug=0):
    self._hmac = hmac.HMAC(key)  # Defaults to MD5.
    self.reverse = reverse
    self.cbc = cbc
    self.debug = debug
    self.WORD2GROUP = cdb.init(os.path.join(basedir, 'w2g.cdb'))
    self.GROUP2WORDS = cdb.init(os.path.join(basedir, 'g2w.cdb'))
    self._a0 = None
    self._a1 = None
    return
def __init__(self, filename):
    self.filename = filename
    self.tempfile = "%s.tmp" % filename
    self.db = {}
    try:
        self.cdb = cdb.init(self.filename)
    except cdb.error:
        d = cdb.cdbmake(self.filename, self.tempfile)
        d.finish()
        del d
        self.cdb = cdb.init(self.filename)
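# The create-if-missing pattern above as a standalone helper: cdb.init raises
# cdb.error when the file is absent or not a valid cdb, so an empty database
# is first written with cdbmake. A sketch; the filename is hypothetical.
import cdb

def open_or_create(filename):
    try:
        return cdb.init(filename)
    except cdb.error:
        maker = cdb.cdbmake(filename, filename + '.tmp')
        maker.finish()
        return cdb.init(filename)

db = open_or_create('cache.cdb')
print db.keys()  # -> [] for a freshly created database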
def example_function(param):
    """
    Example function.

    Keyword arguments:
    param -- the return value
    """
    pages = {'Genetics': {'id': 4}, 'Other': {'id': 5}}
    writer = MediaWikiCdbWriter()
    writer.writeCdbIdFromName("../cdb/pageIdFromName.cdb", pages)

    pageName = "Genetics"
    pageIdFromName = cdb.init("../cdb/pageIdFromName.cdb")
    p = pageIdFromName.get(pageName)
    s = struct.Struct("<l")
    i = s.unpack(p)
    print "xx", i[0]

    d = CdbDictIdFromName("../cdb/pageIdFromName.cdb")
    print "yy", d['Genetics']

    mpp = MyPrettyPrinter()
    pageProjectsFromId = CdbDictPageProjectsFromId("../cdb/pageProjectsFromId.cdb")
    print "pageProjects"
    #print pageProjectsFromId
    #mpp.pprint(pageProjectsFromId)

    d = CdbDictNameFromId("../cdb/pageNameFromId.cdb")
    print "CdbDictNameFromId"
    print d
    print "keys"
    print d.keys()
    print "d[]"
    for i in d:
        print i, d[i]

    d = CdbDictIdFromName("../cdb/pageIdFromName.cdb")
    print "CdbDictIdFromName"
    print "keys"
    print d.keys()
    print d['Genetics']
    print "d[]"
    for i in d:
        print i, d[i]
    return

    # NOTE: the code below is unreachable because of the return above.
    print "d.keys()"
    for i in d.keys():
        print i, d[i]
    print "d.iterkeys()"
    for k in d.iterkeys():
        print d[k]
    print "d.itervalues()"
    for v in d.itervalues():
        print v
    print "d.iteritems()"
    for k, v in d.iteritems():
        print 'd[', k, '] = ', v
    return param
def __getitem__(self, key):
    try:
        return self.db.get(key)
    except:
        # The handle may be stale; reopen the database and retry.
        self.db = cdb.init(self.fn)
        return self.db.get(key)
def get_sentence_by_sid(sid, sid2sent_dir):
    sid = sid.split('%')[0]
    sid_components = sid.split('-')
    if os.path.basename(sid2sent_dir) == "v2006-2015.text-cdb":
        if 'data' in sid:
            sid_components = [x for x in sid_components if x != ""]
            sub_dirs = [sid_components[0], sid_components[1]] + \
                list(sid_components[2][:3]) + [sid_components[2][:4]]
            sub_dir_str = "/".join(sub_dirs)
        else:
            sub_dirs = [sid_components[0]] + \
                list(sid_components[1][:3]) + [sid_components[1][:4]]
            sub_dir_str = "/".join(sub_dirs)
    else:  # e.g. os.path.basename(sid2sent_dir) == "tsubame.results.orig-cdb"
        sub_dirs = [sid_components[0], sid_components[1][:4], sid_components[1][:6]]
        sub_dir_str = "/".join(sub_dirs)
    try:
        sid2sent = "%s/%s.cdb" % (sid2sent_dir, sub_dir_str)
        SID2SENT = cdb.init(sid2sent.encode('utf-8'))
        sent = SID2SENT.get(sid)
        if sent is None:
            sys.stderr.write("Cannot retrieve sentence of sid:%s.\n" % sid)
        return sent
    except:
        sys.stderr.write("Cannot retrieve sentence of sid:%s.\n" % sid)
def __init__(self, filename):
    self.filename = filename
    self.struct = struct.Struct("<l")  # "<l" is a 32-bit little-endian integer
    if not os.path.exists(self.filename):
        open(filename, "w").close()
    self.cdb = cdb.init(filename)
    dict.__init__(self)
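# A small sketch of the "<l" convention used above: integer ids are stored as
# packed 32-bit little-endian bytes in the cdb. Filename and data are
# hypothetical.
import cdb
import struct

s = struct.Struct("<l")
maker = cdb.cdbmake('ids.cdb', 'ids.cdb.tmp')
maker.add('Genetics', s.pack(4))  # store the id as 4 raw bytes
maker.finish()

db = cdb.init('ids.cdb')
print s.unpack(db.get('Genetics'))[0]  # -> 4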
def __init__(self, name, converter):
    super(CachedDB, self).__init__()
    mcdb_name = name[:-4] + '.mcdb'
    if os.path.exists(name):
        print("CDB: opening", name)
        self.db = cdb.init(name)
        self.db_contains = lambda key: self.db.has_key(key)
    elif os.path.exists(mcdb_name):
        print("MCDB: opening", mcdb_name)
        self.db = mcdb.read(mcdb_name)
        self.db_contains = lambda key: self.db.get(key, _missing) is not _missing
    else:
        raise ValueError("Unknown file: %s" % (name, ))
    if converter in [int, float]:
        if converter == int:
            s = struct.Struct('<Q').unpack
        else:
            s = struct.Struct('<f').unpack
        def c(v, s=s):
            return s(v)[0]
        self.converter = c
    elif converter == 'blosc_to_list':
        self.converter = blosc_decompress_int_list
    self.cache = {}
def sync(self, force=False):
    if not self.db:
        return
    tmp = cdb.cdbmake(self.filename, self.tempfile)
    # Copy original
    r = self.cdb.each()
    while r:
        k, v = r
        dk = decode(k)
        if k not in self.db:
            tmp.add(*r)
        r = self.cdb.each()
    # Add new stuff
    for k, l in self.db.iteritems():
        for v in l:
            try:
                tmp.add(k, v)
            except:
                print(k, v)
                raise
    tmp.finish()
    self.cdb = cdb.init(self.filename)
    self.db = {}
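# cdb files are immutable once written, so sync() above rewrites the whole
# database to apply updates: copy every surviving record, then append the new
# ones. A standalone sketch of that merge, assuming data.cdb already exists;
# the filenames and update data are hypothetical.
import cdb

old = cdb.init('data.cdb')
updates = {'k2': 'new-value'}

maker = cdb.cdbmake('data.cdb', 'data.cdb.tmp')
r = old.each()            # each() yields successive (key, value) pairs
while r:
    k, v = r
    if k not in updates:  # keep records that are not being replaced
        maker.add(k, v)
    r = old.each()
for k, v in updates.iteritems():
    maker.add(k, v)
maker.finish()            # replaces data.cdb via the .tmp file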
def search_cdbs(cdbs, key, save_null=False):
    hits = []
    for this_cdb in cdbs:
        this_cdb = cdb.init(this_cdb)
        this_hits = this_cdb.get(key)
        if this_hits is not None or save_null:
            hits.append(this_hits)
    return hits
def clear(self):
    """Remove all entries from the dictionary."""
    os.remove(self.filename)
    open(self.filename, "w").close()
    maker = cdb.cdbmake(self.filename, self.filename + ".tmp")
    maker.finish()
    del(maker)
    self.cdb = cdb.init(self.filename)
def iteritems(self):
    c = cdb.init(self._name_db)
    r = c.each()
    while r:
        item = Item(self, r[0].decode('utf-8'))
        item._fs_item_id = r[1]
        yield item
        r = c.each()
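# The each() cursor used above, as a standalone generator: each() returns
# successive (key, value) tuples and None once exhausted. Filename is
# hypothetical.
import cdb

def iter_records(filename):
    db = cdb.init(filename)
    r = db.each()
    while r:
        yield r          # a (key, value) tuple
        r = db.each()

for key, value in iter_records('names.cdb'):
    print key, '->', value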
def _get_item_id(self, itemname):
    """
    Get ID of item (or None if no such item exists)

    @param itemname: name of item (unicode)
    """
    c = cdb.init(self._name_db)
    return c.get(itemname.encode('utf-8'))
def printCdbFromIdFile(self, filename):
    print "\nfile:" + filename
    c = cdb.init(filename)
    k = c.firstkey()
    while k is not None:
        v = c.get(k)
        i = struct.unpack("<l", k)
        print hex(i[0]), "=>", v
        k = c.nextkey()
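# firstkey()/nextkey(), as used above, walk the keys one at a time instead of
# materialising a full list the way keys() does. A sketch over a hypothetical
# file:
import cdb

db = cdb.init('pages.cdb')
k = db.firstkey()
while k is not None:
    print k, '=>', db.get(k)
    k = db.nextkey()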
def cdb_read_proc(file_cdb):
    dict_aa = {}
    cdb_o = cdb.init(file_cdb)
    rr = cdb_o.each()
    while rr:
        unit = json.loads(rr[1])
        dict_aa[rr[0]] = unit
        rr = cdb_o.each()
    return dict_aa
def __init__(self, config, bli):
    self.base_dir = config.base_dir
    self.db_backend = config.db_backend
    self.cache = bli.cache
    self.cdb_cache = dict()
    if self.db_backend == "ram":
        pass
    elif self.db_backend == "cdb":
        for blacklist in self.cache:
            self.cdb_cache[blacklist] = cdb.init(blacklist)
    self.loop()
def update(self, values):
    """Add values to the dictionary."""
    maker = cdb.cdbmake(self.filename, self.filename + ".tmp")
    for i in values:
        # add key,value
        maker.add(self._pack_key(i), self._pack_value(values[i]))
    print "Added %d records to CDB %s (fd %d)" \
        % (maker.numentries, maker.fn, maker.fd)
    maker.finish()
    del(maker)
    self.cdb = cdb.init(self.filename)
def get_orig_sentence(sid):
    sid = sid.split('%')[0]
    sub_dir = sid.split('-')[0]
    sub_dir2 = sid.split('-')[1][:4]
    sub_dir3 = sid.split('-')[1][:6]
    file_loc = "%s/%s/%s/%s.cdb" % (ORIG_DIR, sub_dir, sub_dir2, sub_dir3)
    #sys.stderr.write(file_loc + '\n')
    F = cdb.init(file_loc)
    sent = F.get(sid)
    #sys.stderr.write(sent + '\n')
    return sent
def get(self, searchKey, exhaustive=False):
    if exhaustive:
        for i in range(len(self.mapping)):
            nowCDB = cdb.init(self.mapping[i]["cdb"])
            # Encode the key as in the non-exhaustive branch below.
            value = nowCDB.getall(searchKey.encode("utf-8"))
            if value:
                return value
        return None
    else:
        nowCDB = self.mapping[-1]["cdb"]
        for i in range(1, len(self.mapping)):
            nowKey = self.mapping[i]["key"]
            if searchKey.encode("utf-8") < nowKey:
                nowCDB = self.mapping[i - 1]["cdb"]
                break
        targetCDB = cdb.init(nowCDB)
        value = (targetCDB.getall(searchKey.encode("utf-8"))
                 if self.repeated_keys
                 else targetCDB.get(searchKey.encode("utf-8")))
        return value
def test_reducer(self):
    red = CDBReducer()
    output = red(zip('abcde', '12345'))
    fn = mkstemp()[1]
    fo = open(fn, 'wb')
    fo.writelines(v for k, v in output)
    fo.close()
    db = cdb.init(fn)
    self.assertEqual([(k, db[k]) for k in db.keys()],
                     [('a', '1'), ('b', '2'), ('c', '3'), ('d', '4'), ('e', '5')])
    os.remove(fn)
def __init__(self, dir):
    self.struct = struct.Struct("<l")
    self.pageIdFromName = cdb.init(dir + "pageIdFromName.cdb")
    self.pageNameFromId = cdb.init(dir + "pageNameFromId.cdb")
    self.pageLinksFromId = cdb.init(dir + "pageLinksFromId.cdb")
    self.pageProjectsFromId = cdb.init(dir + "pageProjectsFromId.cdb")
    self.projectIdFromName = cdb.init(dir + "projectIdFromName.cdb")
    self.projectNameFromId = cdb.init(dir + "projectNameFromId.cdb")
def __init__(self, fname, userdict=None):
    if fname.endswith(".cdb"):
        if not cdb:
            raise RuntimeError("cdb is not supported.")
        self.dict = cdb.init(fname)
    else:
        self.dict = {}
        fp = file(fname)
        while True:
            s = fp.readline()
            if not s:
                break
            f = s.split("\t")
            self.dict[f[0]] = f[1]
    return
def test_default(self):
    proc = CDBFactory()
    self.assertEqual(proc('k1', ['v1']), None)
    self.assertEqual(proc('k2', ['v2', 'v3']), None)
    chunks = proc.close()
    fn = mkstemp()[1]
    fo = open(fn, 'wb')
    for chk in chunks:
        self.assertTrue(len(chk) <= proc.chunksize)
        fo.write(chk)
    fo.close()
    db = cdb.init(fn)
    self.assertEqual([(k, db[k]) for k in db.keys()],
                     [('k1', 'v1'), ('k2', 'v2')])
    os.remove(fn)
def phase_1(pkt):
    if pkt.haslayer(Dot11):
        if pkt.type == 0 and pkt.subtype in (0, 2, 4):
            if pkt.addr2 not in clients:
                vendor_id = pkt.addr2[0:8]
                upper_case = str(vendor_id).upper()
                db_name = "mac_address_db"
                db = cdb.cdbmake("../lib/" + db_name, "../lib/" + db_name + ".tmp")
                del db
                db = cdb.init("../lib/" + db_name)
                match = db.get(upper_case)
                print("{:<6s}{:>13}{:>12s}".format(str(len(clients) + 1),
                                                   pkt.addr2, match))
                clients.append(pkt.addr2)
                vendors.append(match)
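# A hedged sketch of how a vendor-prefix database like mac_address_db above
# might be built: keys are the first 8 characters of the MAC address (the
# OUI, upper-cased), values are vendor names. The entry shown is hypothetical.
import cdb

maker = cdb.cdbmake("../lib/mac_address_db", "../lib/mac_address_db.tmp")
maker.add("00:1A:2B", "Example Vendor Inc.")
maker.finish()

db = cdb.init("../lib/mac_address_db")
print db.get("00:1a:2b".upper())  # -> 'Example Vendor Inc.'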
def load_data():
    global definitions
    global wiktionary_definitions
    global redirect
    for x in open(os.path.join(DATA_PATH, 'simple_wiki_fs.txt')):
        x = x.strip()
        a, b = x.split('\t')
        definitions[lower_string(a)] = b
    for x in open(os.path.join(DATA_PATH, 'simple_wiki_redirect.txt')):
        a, b = lower_string(x).split()
        redirect[key_title(a)] = key_title(b)
    wiktionary_definitions = cdb.init(os.path.join(DATA_PATH, 'wiktionary.cdb'))
def replace_word(key, class_num):
    all_noun = []
    for p in WR_POSTFIX:
        WR = "%s%s" % (WORD_REPLACE, p)
        F = cdb.init(WR)
        noun = F.get(key)
        if noun is not None:
            all_noun.extend(noun.rstrip().split('|'))
    rtn = []
    for noun in all_noun:
        now_class, nounList = noun.split('-')
        nounList = nounList.split(':')
        if now_class == class_num:
            nounList = map(lambda x: x.split('#')[0], nounList)
            rtn.extend(nounList)
    return rtn
def openCDB(self):
    prevmask = os.umask(0)
    if not os.path.exists(self.path):
        os.makedirs(self.path, 02775)
        os.chown(self.path, self.uid, self.gid)
    if not os.path.isfile(self.cdbName):
        maker = cdb.cdbmake(self.cdbName, self.cdbName + ".tmp")
        maker.finish()
        del maker
        os.chown(self.cdbName, self.uid, self.gid)
        os.chmod(self.cdbName, 0664)
    os.umask(prevmask)
    self.cdbObject = cdb.init(self.cdbName)
def command(self):
    "command"
    self.init()
    filename = self.options.filename
    if filename is None:
        print "\nThe cdb filename is required\n"
        self.parser.print_help()
        sys.exit(2)
    if not os.path.isfile(filename):
        print "\nThe cdb filename %s does not exist\n" % filename
        self.parser.print_help()
        sys.exit(2)
    print "*" * 10, 'Dumping: %s' % filename, "*" * 10
    cdbo = cdb.init(filename)
    cdbdump(cdbo)
def _destroy_item_locked(self, item):
    c = cdb.init(self._name_db)
    maker = cdb.cdbmake(self._name_db + '.ndb', self._name_db + '.tmp')
    r = c.each()
    while r:
        i, v = r
        if v != item._fs_item_id:
            maker.add(i, v)
        r = c.each()
    maker.finish()
    filesys.rename(self._name_db + '.ndb', self._name_db)
    path = os.path.join(self._path, item._fs_item_id)
    try:
        shutil.rmtree(path)
    except OSError, err:
        raise CouldNotDestroyError("Could not destroy item '%r' [errno: %d]" % (
            item.name, err.errno))
def sid_to_sentence(self, sid):
    sid = sid.split(':')[-1]
    sub_dirs = sid.split('-')
    if sub_dirs[0] == "w201103":
        if sub_dirs[1] == "":
            sub_dirs[0] = "w201103.old/%s" % (sub_dirs[2])
            sub_dirs.pop(1)
        else:
            sub_dirs[0] = "w201103/%s" % sub_dirs[1]
            sub_dirs.pop(1)
    which_cdb = "%s/%s/%s/%s.cdb" % (self.sentence_cdb_dir, sub_dirs[0],
                                     "/".join(sub_dirs[1][:3]), sub_dirs[1][:4])
    if not os.path.isfile(which_cdb):
        sys.stderr.write("cdb file not found for %s.\n" % sid)
        return None
    c = cdb.init(which_cdb)
    return c[sid]
def __search_cdb(self, pathname, keys, actions, source):
    """
    Search DJB's constant databases; see
    <http://cr.yp.to/cdb.html>.
    """
    import cdb
    cdb = cdb.init(pathname)
    found_match = 0
    for key in keys:
        if key and key.lower() in cdb:
            found_match = 1
            cdb_value = cdb[key.lower()]
            # If there is an entry for this key,
            # we consider it an overriding action
            # specification.
            if cdb_value:
                actions.clear()
                actions.update(self.__buildactions(cdb_value, source))
            break
    return found_match
def __search_cdb(self, pathname, keys, actions, source):
    """
    Search DJB's constant databases; see
    <http://cr.yp.to/cdb.html>.
    """
    import cdb
    cdb = cdb.init(pathname)
    found_match = 0
    for key in keys:
        if key and cdb.has_key(string.lower(key)):
            found_match = 1
            cdb_value = cdb[string.lower(key)]
            # If there is an entry for this key,
            # we consider it an overriding action
            # specification.
            if cdb_value:
                actions.clear()
                actions.update(self.__buildactions(cdb_value, source))
            break
    return found_match
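# Both membership idioms in the two __search_cdb variants above work against
# a cdb object: `key in db` and db.has_key(key). A quick sketch with a
# hypothetical file:
import cdb

db = cdb.init('actions.cdb')
key = 'Deliver'.lower()
if key in db:            # newer idiom
    print db[key]
if db.has_key(key):      # older idiom, same test
    print db.get(key)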
def check_oldpw(accountname, oldpw):
    passwd_dbfile = os.path.abspath(home_dir + "/passwd.cdb")
    try:
        db = cdb.init(passwd_dbfile)
    except:
        return 'No user database found.'
    try:
        cdb_user_data = db[accountname]
    except:
        return 'User not found or password incorrect.'
    passhash = cdb_user_data[6:40]
    # Hash algorithm is given between the first two $ of passhash
    # (here only the md5-based BSD password scheme is used)
    hashtype = '1'
    # Salt is given between the next two $
    salt = passhash[3:11]
    opensslargs = ['openssl', 'passwd', '-' + hashtype, '-salt', salt, oldpw]
    newhash = check_output(opensslargs).strip()
    if newhash == passhash:
        return ''
    return 'User not found or password incorrect.'
def _rename_item_locked(self, arg):
    item, newname = arg
    nn = newname.encode('utf-8')
    npath = os.path.join(self._path, item._fs_item_id, 'name')
    c = cdb.init(self._name_db)
    maker = cdb.cdbmake(self._name_db + '.ndb', self._name_db + '.tmp')
    r = c.each()
    while r:
        i, v = r
        if i == nn:
            raise ItemAlreadyExistsError("Target item '%r' already exists!" % newname)
        elif v == item._fs_item_id:
            maker.add(nn, v)
        else:
            maker.add(i, v)
        r = c.each()
    maker.finish()
    filesys.rename(self._name_db + '.ndb', self._name_db)
    nf = open(npath, mode='wb')
    nf.write(nn)
    nf.close()
def __init__(self, pathKB="/work1/t2g-13IAM/13IAM511/extkb"):
    self.cdbTuples = cdb.init(os.path.join(pathKB, "tuples.simple.cdb"))
    self.totalFreq = float(
        open(os.path.join(pathKB, "tuples.simple.totalfreq.txt")).read())
g_wep = [re.compile(x.strip()[1:-1]) if x.startswith("/") else re.compile("^%s$" % x.strip())
         for x in open(_myfile("weak-evident-preds.txt")) if not x.startswith("#")]
g_pnp = [x.strip() for x in open(_myfile("proper-name-preds.txt"))]
g_prnp = [x.strip() for x in open(_myfile("pronoun-preds.txt"))]
g_mp = [x.strip() for x in open(_myfile("modality-preds.txt"))]
g_handinc = dict([(x.strip(), 1) for x in open(_myfile("incompatible.txt"))])

# print >>sys.stderr, "Loading schema..."
g_schema = {}

if os.path.exists(_myfile("schemas-size12.cdb")):
    print >>sys.stderr, "Using cache!"
    g_schema = cdb.init(_myfile("schemas-size12.cdb"))
else:
    if "schema" in pa.caching:
        maker = cdb.cdbmake(_myfile("schemas-size12.cdb"),
                            _myfile("schemas-size12.cdb.tmp"))

    schema_id = 0

    for score, events, event_scores, roles in re.findall(
            "\*\*\*\*\*\nscore=([-0-9.]+)\nEvents: (.*?)\nScores: (.*?)\n(.*?)\n\n",
            open(_myfile("schemas-size12")).read(),
            re.MULTILINE | re.DOTALL):
        schema_id += 1
        scores_dict = {}
        for i, e in enumerate(events.split()):
            scores_dict[e] = float(event_scores.split()[i])
        role_id = 0
def write_pairs(self, f1, f2):
    '''Parse through two paired files and only write if both pairs are present'''

    def intersect(a, b):
        '''Intersection between lists'''
        return list(set(a) & set(b))

    def rm_files(patterns):
        '''Remove files using glob given as list of patterns'''
        import glob
        import os
        for p in patterns:
            files = glob.glob(p)
            if len(files) == 0:
                pass
            else:
                map(os.remove, files)

    def write_out(db_common, f, o):
        '''Write out reads'''
        if self.gz:
            fh = open(f, 'r')
            out = gzip.open(o + '.gz', 'wb')
        else:
            fh = open(f, 'r')
            out = open(o, 'w')
        written_count = 0
        total_count = 0
        for (title, sequence, quality) in FastqGeneralIterator(fh):
            total_count += 1
            if db_common.has_key(title[:-2]):
                out.write('@%s\n%s\n+\n%s\n' % (title, sequence, quality))
                written_count += 1
        # Use float division so the percentage is not truncated to 0.
        sys.stderr.write('%s: Total %i, Written %i (%.1f%%)\n' %
                         (f, total_count, written_count,
                          100.0 * written_count / total_count))
        fh.close()
        out.close()

    def create_db(f, db_fname):
        '''Write out db of headers'''
        fh = open(f, 'r')
        fh_headers = (x.strip()[1:-2] for i, x in enumerate(fh) if not (i % 4))
        db = cdb.cdbmake(db_fname, db_fname + '.tmp')
        for h in fh_headers:
            db.add(h, 'T')
        db.finish()
        del(db)

    ## get headers from both trimmed files ##
    # strip the /2 or /1 and grab only the headers
    # write in dbm to minimize memory usage
    # create db's (parallel)
    rand = ''.join(random.choice(string.ascii_uppercase + string.digits)
                   for x in range(36))
    db1_fname = 'db1_%s' % rand
    db2_fname = 'db2_%s' % rand
    jobs = []
    p = multiprocessing.Process(target=create_db, args=(f1, db1_fname, ))
    p.start()
    jobs.append(p)
    p = multiprocessing.Process(target=create_db, args=(f2, db2_fname, ))
    p.start()
    jobs.append(p)
    # wait for jobs to finish
    for job in jobs:
        job.join()

    ## get headers that are in both trimmed files ##
    db1 = cdb.init(db1_fname)
    db2 = cdb.init(db2_fname)
    common = intersect(db1.keys(), db2.keys())
    dbcommon_fname = 'dbcommon_%s' % rand
    db_common = cdb.cdbmake(dbcommon_fname, dbcommon_fname + '.tmp')
    for h in common:
        db_common.add(h, 'T')
    db_common.finish()
    del(db_common)

    ## get headers that are in only one trimmed file ##
    symdiff = set(db1.keys()).symmetric_difference(set(db2.keys()))
    dbdiff_fname = 'dbdiff_%s' % rand
    db_diff = cdb.cdbmake(dbdiff_fname, dbdiff_fname + '.tmp')
    for h in symdiff:
        db_diff.add(h, 'T')
    db_diff.finish()
    del(db_diff)

    ## open common db ##
    db_common = cdb.init(dbcommon_fname)
    jobs = []
    p = multiprocessing.Process(target=write_out, args=(db_common, f1, self.o[0]))
    p.start()
    jobs.append(p)
    p = multiprocessing.Process(target=write_out, args=(db_common, f2, self.o[1]))
    p.start()
    jobs.append(p)

    ## open single db ##
    self.single = [self.o[0] + '.single', self.o[1] + '.single']
    db_diff = cdb.init(dbdiff_fname)
    p = multiprocessing.Process(target=write_out, args=(db_diff, f1, self.single[0]))
    p.start()
    jobs.append(p)
    p = multiprocessing.Process(target=write_out, args=(db_diff, f2, self.single[1]))
    p.start()
    jobs.append(p)
    # wait for jobs to finish
    for job in jobs:
        job.join()

    rm_files([db1_fname, db2_fname, dbcommon_fname, dbdiff_fname, f1, f2])
import cdb

inputfile = "morepork-dropout-3e116ed1-i15-h399-o2-b1-8000Hz-w512.net"
db = cdb.init(inputfile)
for k in db.keys():
    print k
g_wep = [re.compile(x.strip()[1:-1]) if x.startswith("/") else re.compile("^%s$" % x.strip())
         for x in open(_myfile("../data/weak-evident-preds.txt")) if not x.startswith("#")]
g_pnp = [x.strip() for x in open(_myfile("../data/proper-name-preds.txt"))]
g_prnp = [x.strip() for x in open(_myfile("../data/pronoun-preds.txt"))]
g_mp = [x.strip() for x in open(_myfile("../data/modality-preds.txt"))]
g_handinc = dict([(x.strip(), 1) for x in open(_myfile("../data/incompatible.txt"))])

# print >>sys.stderr, "Loading schema..."
g_schema = {}

if os.path.exists(_myfile("../data/schemas-size12.cdb")):
    print >>sys.stderr, "Using cache!"
    g_schema = cdb.init(_myfile("../data/schemas-size12.cdb"))
else:
    if "schema" in pa.caching:
        maker = cdb.cdbmake(_myfile("../data/schemas-size12.cdb"),
                            _myfile("../data/schemas-size12.cdb.tmp"))

    schema_id = 0

    for score, events, event_scores, roles in re.findall(
            "\*\*\*\*\*\nscore=([-0-9.]+)\nEvents: (.*?)\nScores: (.*?)\n(.*?)\n\n",
            open(_myfile("../data/schemas-size12")).read(),
            re.MULTILINE | re.DOTALL):
        schema_id += 1
        scores_dict = {}
        for i, e in enumerate(events.split()):
            scores_dict[e] = float(event_scores.split()[i])
        role_id = 0
def keys(self):
    try:
        return self.db.keys()
    except:
        self.db = cdb.init(self.fn)
        return self.db.keys()
def __init__(self, cdbname):
    CMap.__init__(self)
    self.cdbname = cdbname
    self.db = cdb.init(cdbname)
    return
try:
    os.mkdir(ipath)
    done = True
except OSError, err:
    if err.errno != errno.EEXIST:
        raise

if cntr > 2 and not done and self._itemspace <= 2 ** 31:
    self._itemspace *= 2
    cntr = 0
elif cntr > 20:
    # XXX: UnexpectedBackendError() that propagates to user?
    raise Exception('Item space full!')

nn = item.name.encode('utf-8')
c = cdb.init(self._name_db)
maker = cdb.cdbmake(self._name_db + '.ndb', self._name_db + '.tmp')
r = c.each()
while r:
    i, v = r
    if i == nn:
        # Oops. This item already exists! Clean up and error out.
        maker.finish()
        os.unlink(self._name_db + '.ndb')
        os.rmdir(ipath)
        if newrev is not None:
            os.unlink(newrev)
        raise ItemAlreadyExistsError("Item '%r' already exists!" % item.name)
    else:
        maker.add(i, v)
    r = c.each()
def __init__(self, filename):
    self.db = cdb.init(filename)
def main():
    reader_format = 'pb'
    writer_format = 'pb'
    delim = '\t'
    fields = []
    key = None
    typename = ""
    pb2file = None
    indextype = None
    indexreader = None
    indexfile = None
    fin = None
    fout = sys.stdout
    infile = None
    verbose = 0

    opts, args = getopt.getopt(sys.argv[1:], 'R:W:F:d:p:k:i:x:t:m:v')
    for o, a in opts:
        if o == '-R':
            reader_format = a
        elif o == '-W':
            writer_format = a
        elif o == '-F':
            fields = a.split(',')
        elif o == '-d':
            delim = a
        elif o == '-p':
            pb2file = a
        elif o == '-m':
            typename = a
        elif o == '-k':
            key = a
        elif o == '-i':
            indextype = a
        elif o == '-x':
            indexfile = a
        elif o == '-v':
            verbose += 1

    if key is None:
        raise Exception("missing key parameter, specify with -k")
    if not len(args):
        raise Exception("missing input data file argument")

    infile = shift(args)
    fin = file(infile)

    # create the index reader object
    if indextype == 'cdb':
        import cdb
        indexreader = cdb.init(indexfile)
    elif indextype is None:
        raise Exception("missing index type parameter, specify with -i")

    # initialize reader / writer codecs
    if pb2file:
        import lwpb.codec
        pb2codec = lwpb.codec.MessageCodec(pb2file=pb2file, typename=typename)
    if len(fields):
        import percent.codec
        txtcodec = percent.codec.PercentCodec(fields, delim)

    # create the stream reader
    if reader_format == 'pb':
        import lwpb.stream
        reader = lwpb.stream.StreamReader(fin, codec=pb2codec)
    elif reader_format == 'txt':
        import percent.stream
        reader = percent.stream.PercentCodecReader(fin, txtcodec)
    else:
        raise Exception("bad reader format")

    # create the stream writer
    if writer_format == 'pb':
        import lwpb.stream
        writer = lwpb.stream.StreamWriter(fout, codec=pb2codec)
    elif writer_format == 'txt':
        import percent.stream
        writer = percent.stream.PercentCodecWriter(fout, txtcodec)
    else:
        raise Exception("bad writer format")

    # lookup, read, and write records
    for line in sys.stdin:
        indexkey = line.strip('\r\n')
        for indexvalue in indexreader.getall(indexkey):
            offset = long(indexvalue)
            fin.seek(offset, os.SEEK_SET)
            record = reader.read()
            writer.write(record)

    return 0
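# getall(), used for the index lookups above, returns every value stored
# under a key (cdb permits duplicate keys), whereas get() returns only the
# first. A sketch with hypothetical data:
import cdb

maker = cdb.cdbmake('index.cdb', 'index.cdb.tmp')
maker.add('page', '0')        # the same key may be added more than once
maker.add('page', '1024')
maker.finish()

db = cdb.init('index.cdb')
print db.get('page')          # -> '0' (first match only)
print db.getall('page')       # -> ['0', '1024']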
def _get_reader(self, **kwargs):
    self.db.finish()
    return cdb.init(self.cdb_path.encode('utf-8'), **kwargs)
def __init__(self, filename):
    self.db = cdb.init(filename.encode('utf8'))
#!/usr/bin/env python
# check the contents of the PDF url mapping table.
import cdb

db = cdb.init('omega/cdb/pdfurl')
for key in db.keys():
    print key, ' ', db.get(key)
def lookup_by_id(config, eon_id):
    name = os.path.join(config.get("broker", "grn_to_eonid_map_location"),
                        "eon_catalog_by_id.cdb")
    cdb_file = cdb.init(name)
    return cdb_file.has_key(str(eon_id))
def __init__(self, path="/work/naoya-i/kb/"):
    self.w2vi = cdb.init(os.path.join(path, "GoogleNews-vectors-negative300.index.cdb"))
    self.w2vf = open(os.path.join(path, "GoogleNews-vectors-negative300.bin"), "rb")
    self.w2vdb = mmap.mmap(self.w2vf.fileno(), 0, prot=mmap.PROT_READ)
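# A hedged sketch of the lookup this index likely supports, assuming the cdb
# maps each vocabulary word to the byte offset of its 300-dimensional float32
# vector inside the mmapped .bin file. The key format and offset encoding are
# assumptions, not confirmed by the source.
import struct

def get_vector(self, word):
    offset = self.w2vi.get(word)        # assumed: byte offset as a decimal string
    if offset is None:
        return None
    start = int(offset)
    raw = self.w2vdb[start:start + 300 * 4]
    return struct.unpack('<300f', raw)  # 300 little-endian float32 values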