예제 #1
0
파일: test_plyvel.py 프로젝트: ecdsa/plyvel
def test_destroy_db():
    """destroy_db() must leave nothing behind at the database path."""
    with tmp_db('destroy', create=False, delete=False) as path:
        database = DB(path, create_if_missing=True)
        database.put(b'foo', b'bar')
        database.close()
        # Let go of the handle before the files are destroyed.
        del database

        plyvel.destroy_db(path)
        assert not os.path.lexists(path)
예제 #2
0
파일: test_plyvel.py 프로젝트: ecdsa/plyvel
def test_repair_db():
    """A repaired database must still return data written before the repair."""
    with tmp_db('repair', create=False) as name:
        db = DB(name, create_if_missing=True)
        db.put(b'foo', b'bar')
        db.close()
        del db

        plyvel.repair_db(name)
        db = DB(name)
        try:
            assert_equal(b'bar', db.get(b'foo'))
        finally:
            # Close the reopened handle before the tmp_db context exits;
            # it used to stay open until the function returned.
            db.close()
예제 #3
0
def main():
    """Copy per-category counts from the CSV at DATA_PATH into LevelDB.

    Every CSV row is unpacked as (pid, name, num_pages, num_subcats,
    num_files).  The three counts are stored as a JSON object under the key
    PREFIX + name; pid is not stored.
    """
    db = DB("/home/xinyang/Datasets/Wikipedia/enwiki_leveldb/")
    try:
        # newline="" is how the csv module asks files to be opened; the
        # with-block closes the file, which the original never did.
        with open(DATA_PATH, encoding="latin1", newline="") as fobj:
            rows = tqdm(csv.reader(fobj))
            for _pid, name, num_pages, num_subcats, num_files in rows:
                counts = dict(num_pages=int(num_pages),
                              num_subcats=int(num_subcats),
                              num_files=int(num_files))
                db.put(PREFIX + name.encode(), ujson.dumps(counts).encode())
    finally:
        # Release the database even when a row fails to parse.
        db.close()
예제 #4
0
def store_city_name_id(data, db_name='imd_city_db'):
    '''
        Stores City Names, IDs and corresponding links into a local levelDB.
        City ID is used as key value.

        Returns {'status': 'success'} when every entry was written, otherwise
        {'status': <text of the exception that was raised>}.
    '''
    resp = {}
    try:
        db_handle = DB(db_name, create_if_missing=True)
        try:
            for key, value in __format_db_entry__(data).items():
                db_handle.put(key, value)
        finally:
            # Close the handle even when formatting or a put fails, so a
            # failed call does not keep the database open.
            db_handle.close()
        resp = {'status': 'success'}
    except (plError, Exception) as e:
        # Both original handlers produced the same response, so they are merged.
        resp = {'status': str(e)}
    return resp
예제 #5
0
파일: test_plyvel.py 프로젝트: ecdsa/plyvel
def test_open_close():
    """After close(), the DB and every object derived from it raise RuntimeError."""
    with tmp_db('open_close', create=False) as name:
        # Ask for an LRU cache and a bloom filter so the database holds
        # additionally allocated objects at the time it is closed.
        db = DB(name,
                create_if_missing=True,
                lru_cache_size=1024 * 1024,
                bloom_filter_bits=10)
        db.put(b'key', b'value')
        batch = db.write_batch()
        snap = db.snapshot()
        db_iter = db.iterator()
        snap_iter = snap.iterator()

        db.close()
        assert db.closed

        # Direct operations on the closed database.
        assert_raises(RuntimeError, db.get, b'key')
        assert_raises(RuntimeError, db.put, b'key', b'value')
        assert_raises(RuntimeError, db.delete, b'key')

        # A write batch created before the close.
        assert_raises(RuntimeError, batch.put, b'key', b'value')

        # Creating a new snapshot, and reading from the old one.
        with assert_raises(RuntimeError):
            db.snapshot()
        assert_raises(RuntimeError, snap.get, b'key')

        # An iterator over the database.
        assert_raises(RuntimeError, next, db_iter)

        # An iterator over the snapshot.
        assert_raises(RuntimeError, next, snap_iter)
예제 #6
0
# Benchmark-style script: fills the database with 1024 inode records and
# reports how many blocks and inodes were written.
# NOTE(review): blocks_sample, INode, update_vfs, populate_block, dumps and
# sys are not defined in this fragment - presumably set up earlier in the file.
total_blocks = 1024 * 10
block_size = 1024
# Running counters of inserted inodes / blocks, printed at the end.
ev_inodes, ev_blocks = (0, 0)
# The LevelDB block size is taken from the first command-line argument.
db = DB('/home/cujo/nfs/db/db2',
        create_if_missing=True,
        block_size=int(sys.argv[1]))
# Only the second element of update_vfs()'s result is kept and fed back in.
_, current_blocks = update_vfs(block_size, total_blocks)

for i in range(1024):
    bytes_written = 0
    inode = INode()
    # Every slot of f_blocks except the last one gets a block number.
    for k in range(len(inode.f_blocks) - 1):
        block_number = blocks_sample.pop()
        inode.f_blocks[k] = block_number
        bytes_written += populate_block(block_number)
        ev_blocks += 1
        _, current_blocks = update_vfs(block_size, current_blocks)
    inode.f_size = bytes_written
    # NOTE(review): on Python 3 bytes(i) is i zero bytes, not the decimal
    # text of i (that is Python 2 behaviour) - confirm which key format the
    # readers of this database expect.
    db.put(b'i_' + bytes(i), dumps(inode.__dict__))
    ev_inodes += 1
    _, current_blocks = update_vfs(block_size, current_blocks)
# Whatever is left in blocks_sample is stored under the key b'fb'.
# NOTE(review): on Python 3 bytes(list) raises ValueError for values above
# 255 - confirm the intended interpreter/encoding.
db.put(b'fb', bytes(blocks_sample))
ev_blocks += 1
_, current_blocks = update_vfs(block_size, current_blocks)
db.close()

# Some Stats collection for evaluation
print('Total Blocks inserted = ', ev_blocks)
print('Total INodes inserted = ', ev_inodes)
#!/usr/bin/env python
import csv
from collections import defaultdict

import ujson
from plyvel import DB

CATEGORYLINKS_PATH = "/home/xinyang/Datasets/Wikipedia/csv/enwiki-20180101-categorylinks-clean.csv"
PREFIX = b"category_pageids_"

if __name__ == "__main__":
    # Pass 1: group page ids by category name, keeping only rows whose
    # third column is 'page'.
    cate_pageids = defaultdict(list)
    with open(CATEGORYLINKS_PATH) as fobj:
        for row in csv.reader(fobj):
            if row[2] != 'page':
                continue

            page_id, cat_name = row[0], row[1]
            cate_pageids[cat_name].append(page_id)

    # Pass 2: store one JSON list of page ids per category.
    # The database is opened before the try block: if opening fails there is
    # nothing to close, and the original `finally: db.close()` would have
    # replaced the real error with an AttributeError on None.
    db = DB("/home/xinyang/Datasets/Wikipedia/enwiki_leveldb")
    try:
        for cate_name, page_ids in cate_pageids.items():
            db.put(PREFIX + cate_name.encode(), ujson.dumps(page_ids).encode())
    finally:
        db.close()
class FeatureSelector( Frontend ):


  def __init__( self, fn, mode ):
    """Create a selector with no stores open and no statistics loaded.

    fn and mode are passed straight to Frontend.__init__; the stores
    themselves are opened later by __enter__.
    """

    Frontend.__init__( self, fn, mode )

    # KDB store: file name and handle.
    self._kdbfn, self._kdb = None, None

    # LDB store: directory name and handle.
    self._ldbdn, self._ldb = None, None

    # Lengths of the c, b and x parts of a row.
    self._len_c = None
    self._len_b = None
    self._len_x = None

    # Statistics per dimension and per pair of dimensions.
    self._ic = None
    self._icbp = None

    # _init() has not classified the dimensions yet.
    self._needs_initialization = True

    # Classification of the dimensions of the concatenated row.
    self._core_dims = set()
    self._satellite_dims = set()
    self._removed_dims = set()

    # Positions to drop from the c, b and x parts.
    self._remove_c = set()
    self._remove_b = set()
    self._remove_x = set()

    # When set, the matching apply_* method returns its input unchanged.
    self.bypass_c = False
    self.bypass_b = False
    self.bypass_x = False


  def __enter__( self ):
    """Load saved statistics (mode "r") or open scratch stores (mode "w").

    In mode "r" the pickled state in self._fn supplies the part lengths and
    the two statistics tables.  In mode "w" a KDB file and an LDB directory
    are created under freshly generated temporary names.

    Returns self.  Raises AssertionError when the KDB store cannot be opened.
    """

    if self._mode == "r":
      with open( self._fn, "rb" ) as f:
        state = pickle_load( f )
      self._len_c = state[ "c" ]
      self._len_b = state[ "b" ]
      self._len_x = state[ "x" ]
      self._lenrow = self._len_c + self._len_b + self._len_x
      self._ic = state[ "ic" ]
      self._icbp = state[ "icbp" ]

    if self._mode == "w":

      # Only the generated name is kept; the placeholder file is gone once
      # the with-block ends.
      with NamedTemporaryFile() as tmpfn:
        self._kdbfn = tmpfn.name + '.kch'
      self._kdb = KDB()
      try:
        # The call used to live inside an assert, so `python -O` removed it
        # and the store was never opened.
        opened = self._kdb.open( self._kdbfn, KDB.OWRITER | KDB.OCREATE )
        if not opened:
          raise AssertionError( "could not open " + self._kdbfn )
      except Exception:
        print( str( self._kdb.error() ) )
        raise

      # Same trick for the LDB directory: keep the name, let LDB create it.
      with TemporaryDirectory() as tmpdirname:
        self._ldbdn = tmpdirname
      self._ldb = LDB( self._ldbdn, create_if_missing=True )

    return self

  def __exit__( self, exc_type, exc_value, traceback ):
    """Run the base-class exit, then close and delete both scratch stores.

    Raises AssertionError when the KDB store reports a failed close.
    Returns None, so an exception raised inside the with-block propagates.
    """

    # Evaluate the base-class call outside the assert: under `python -O`
    # the whole assert statement, call included, used to be dropped.
    base_result = Frontend.__exit__( self, exc_type, exc_value, traceback )
    assert base_result == False

    if self._ldb is not None:
      # NOTE(review): the reason for pausing before the close is not
      # visible in this file.
      sleep( 3.0 )
      self._ldb.close()

    if self._ldbdn is not None:
      rmtree( self._ldbdn )

    if self._kdb is not None:
      try:
        # Same reasoning as above: close() must run regardless of -O.
        closed = self._kdb.close()
        if not closed:
          raise AssertionError( "could not close " + str( self._kdbfn ) )
      except Exception:
        print( str( self._kdb.error() ) )
        raise

    if self._kdbfn is not None:
      remove( self._kdbfn )


  def train( self, row ):
    """Add one training row ( y, c, b, x ) to the pair counts in the KDB store.

    The lengths of c, b and x are fixed by the first row and asserted for
    every later one.  For each counted pair ( i, j ) of positions in the
    concatenated row c + b + x, the counter keyed by
    pack( '>IIIII', i, j, y, row[i], row[j] ) is incremented by one.

    Returns True (and counts nothing) when Frontend.train returns a true
    value, otherwise None.  Raises AssertionError when an increment fails.
    """

    ( y, c, b, x ) = row

    if self._len_c is None:
      self._len_c = len(c)
    assert self._len_c == len(c)

    if self._len_b is None:
      self._len_b = len(b)
    assert self._len_b == len(b)

    if self._len_x is None:
      self._len_x = len(x)
    assert self._len_x == len(x)

    row = c + b + x

    if Frontend.train( self, row ):
      return True

    keyfmt = '>IIIII'
    last = self._lenrow - 1

    for i in range( 0, self._lenrow ):
      for j in range( 0, self._lenrow ):

        # Count pairs with i < j; the last dimension is additionally paired
        # with every j.
        if ( i >= j ) and ( i != last ):
          continue

        key = pack( keyfmt, i, j, y, row[i], row[j] )

        try:
          # The increment used to sit inside an assert, so `python -O`
          # silently skipped all counting.
          counted = self._kdb.increment( key, 1, 0 )
          if not counted:
            raise AssertionError( "increment failed" )
        except Exception:
          print( str(self._kdb.error()) )
          raise


  def _stats( self, cnt_by_a, cnt_by_b, cnt_by_ab ):
    """Return ( corr, h_a, h_b, h_ab, mi ) for two jointly counted variables.

    h_a, h_b and h_ab are entropies in bits, computed from the counts
    divided by self._rowcount.  mi is h_a + h_b - h_ab and corr is
    mi / min( h_a, h_b ).  When either marginal entropy is zero, corr is
    reported as 1.0.

    The zero-entropy case used to return a bare 1.0, which the 5-tuple
    unpacking in _init cannot handle; it now returns a tuple as well.
    """

    total = float( self._rowcount )

    def entropy( counts ):
      # Entropy in bits of one count table; zero counts contribute nothing.
      h = 0.0
      for cnt in counts.values():
        p = float(cnt) / total
        if p > 0.0:
          h -= p * log( p, 2.0 )
      return h

    h_a = entropy( cnt_by_a )
    h_b = entropy( cnt_by_b )
    h_ab = entropy( cnt_by_ab )

    mi = h_a + h_b - h_ab

    if ( h_a == 0.0 ) or ( h_b == 0.0 ):
      return ( 1.0, h_a, h_b, h_ab, mi )

    return ( mi / min( h_a, h_b ), h_a, h_b, h_ab, mi )


  def _get_info_content_by_dimension( self, i ):
    """Return _stats() between the y value and the value of dimension i.

    Reads LDB records starting at the key ( i, 0, 0, 0, 0 ) and uses only
    those that share the first j found for this i.  The summed counts are
    asserted to equal self._rowcount.
    """

    key_layout = '>IIIII'
    val_layout = '>Q'

    first_j = None

    counts_y = {}
    counts_dim = {}
    counts_joint = {}
    seen = 0

    with self._ldb.iterator() as cursor:

      cursor.seek( pack( key_layout, i, 0, 0, 0, 0 ) )

      for ( raw_key, raw_val ) in cursor:

        fields = unpack( key_layout, raw_key )
        weight = unpack( val_layout, raw_val )[ 0 ]

        # Stop once the records belong to another first dimension.
        if fields[0] != i:
          break

        if first_j is None:
          first_j = fields[1]

        # Stop once the records belong to another second dimension.
        if fields[1] != first_j:
          break

        # fields[2] is the y value, fields[3] the value of dimension i.
        y_val = fields[2]
        dim_val = fields[3]
        joint = ( y_val, dim_val )

        counts_joint[ joint ] = counts_joint.get( joint, 0 ) + weight
        counts_y[ y_val ] = counts_y.get( y_val, 0 ) + weight
        counts_dim[ dim_val ] = counts_dim.get( dim_val, 0 ) + weight

        seen += weight

    try:
      assert seen == self._rowcount
    except:
      print( i, first_j, seen, self._rowcount )
      raise

    return self._stats( counts_y, counts_dim, counts_joint )


  def _get_info_content_by_pair( self, i, j ):
    """Return _stats() between dimensions i and j, each combined with y.

    Reads the LDB records whose key starts with ( i, j ).  The first
    variable is ( y, value of dimension i ), the second is
    ( y, value of dimension j ).  The summed counts are asserted to equal
    self._rowcount.
    """

    key_layout = '>IIIII'
    val_layout = '>Q'

    counts_a = {}
    counts_b = {}
    counts_ab = {}
    seen = 0

    with self._ldb.iterator() as cursor:

      cursor.seek( pack( key_layout, i, j, 0, 0, 0 ) )

      for ( raw_key, raw_val ) in cursor:

        fields = unpack( key_layout, raw_key )
        weight = unpack( val_layout, raw_val )[ 0 ]

        # Leave the loop at the first record outside the ( i, j ) range.
        if ( fields[0] != i ) or ( fields[1] != j ):
          break

        # fields[2] is the y value; fields[3] and fields[4] are the values
        # of dimension i and dimension j.
        a = ( fields[2], fields[3] )
        b = ( fields[2], fields[4] )
        pair = ( a, b )

        assert pair not in counts_ab
        counts_ab[ pair ] = counts_ab.get( pair, 0 ) + weight

        counts_a[ a ] = counts_a.get( a, 0 ) + weight
        counts_b[ b ] = counts_b.get( b, 0 ) + weight

        seen += weight

    assert seen == self._rowcount

    return self._stats( counts_a, counts_b, counts_ab )


  def _finalize( self ):
    """Copy the pair counts from KDB to LDB and build the statistics tables.

    After the copy, self._ic maps every dimension to its
    _get_info_content_by_dimension() result and self._icbp maps every pair
    ( i, j ) with i < j to its _get_info_content_by_pair() result.  Both
    tables and the three part lengths are saved in self._state.
    """

    # Call the base class outside the assert so that `python -O` cannot
    # remove the call together with the check.
    base_result = Frontend._finalize( self )
    assert base_result is None

    # Walk the KDB store from its first record and mirror it into LDB.
    cursor = self._kdb.cursor()
    cursor.jump()

    while True:

      record = cursor.get( True )
      if not record:
        break

      self._ldb.put( record[0], record[1] )

    self._ic = {}
    for i in range( 0, self._lenrow ):
      self._ic[ i ] = self._get_info_content_by_dimension( i )

    self._icbp = {}

    for i in range( 0, self._lenrow ):
      for j in range( i + 1, self._lenrow ):
        self._icbp[ (i,j) ] = self._get_info_content_by_pair( i, j )

    self._state \
      = { "ic": self._ic,
          "icbp": self._icbp,
          "c": self._len_c,
          "b": self._len_b,
          "x": self._len_x }


  def _fmt_dim( self, d_ ):
    """Format row position d_ as "<d_>(<part><offset>)", part being c, b or x.

    The offset is counted from the start of the part that d_ falls into.
    """

    label = None
    if d_ < self._len_c:
      label = "c" + str( d_ )
    elif d_ < self._len_c + self._len_b:
      label = "b" + str( d_ - self._len_c )
    elif d_ < self._len_c + self._len_b + self._len_x:
      label = "x" + str( d_ - self._len_c - self._len_b )
    else:
      # Position beyond the end of the row.
      assert False

    return "{:d}({:s})".format( d_, label )


  def _init( self ):
    """Classify every dimension as core, satellite or removed.

    Core: the (up to) five dimensions with the highest h_ab in self._ic
    plus the (up to) three with the highest corr.  Satellite: for each core
    dimension, the partner with the highest pair corr in self._icbp,
    provided that corr exceeds 0.5.  Removed: everything else; these are
    also translated into per-part positions in _remove_c/_remove_b/_remove_x.
    The three groups are printed.
    """

    self._needs_initialization = False

    by_entropy \
      = [ ( h_ab, dim ) \
          for ( dim, (corr,h_a,h_b,h_ab,mi) ) in self._ic.items() ]

    by_correlation \
      = [ ( corr, dim ) \
          for ( dim, (corr,h_a,h_b,h_ab,mi) ) in self._ic.items() ]

    by_entropy = sorted( by_entropy, reverse=True )
    by_correlation = sorted( by_correlation, reverse=True )

    self._core_dims = set()

    self._core_dims \
      |= { dim for ( score, dim ) in by_entropy[ :5 ] }

    self._core_dims \
      |= { dim for ( score, dim ) in by_correlation[ :3 ] }

    print(
        "core = ",
        " ".join([ self._fmt_dim(d) for d in self._core_dims ])
      )

    self._satellite_dims = set()

    for core_dim in self._core_dims:

      best_dim = None
      best_corr = None

      for ( (i,j), (corr,h_a,h_b,h_ab,mi) ) in self._icbp.items():

        # Weakly correlated pairs never qualify.
        if corr <= 0.5:
          continue

        # Only pairs that contain the core dimension are of interest.
        if i == core_dim:
          partner = j
        elif j == core_dim:
          partner = i
        else:
          continue

        if ( best_corr is None ) or ( corr > best_corr ):
          best_dim = partner
          best_corr = corr

      if best_dim is not None:
        self._satellite_dims.add( best_dim )

    print(
        "satellite = ",
        " ".join([ self._fmt_dim(d) for d in self._satellite_dims ])
      )

    self._removed_dims = set()
    for dim in self._ic:
      if not ( dim in self._core_dims or dim in self._satellite_dims ):
        self._removed_dims.add( dim )

    print(
        "removed = ",
        " ".join([ self._fmt_dim(d) for d in self._removed_dims ])
      )

    # Translate row positions into positions inside the c, b and x parts.
    for dim in self._removed_dims:
      if dim < self._len_c:
        self._remove_c.add( dim )
      elif dim < self._len_c + self._len_b:
        self._remove_b.add( dim - self._len_c )
      elif dim < self._len_c + self._len_b + self._len_x:
        self._remove_x.add( dim - self._len_c - self._len_b )
      else:
        assert False


  def apply_c( self, c ):
    """Return c without the positions listed in _remove_c.

    With bypass_c set, c itself is returned and nothing is initialized.
    """

    if self.bypass_c:
      return c

    if self._needs_initialization:
      self._init()

    return [ value for ( pos, value ) in enumerate( c )
             if pos not in self._remove_c ]


  def apply_b( self, b ):
    """Return b without the positions listed in _remove_b.

    With bypass_b set, b itself is returned and nothing is initialized.
    """

    if self.bypass_b:
      return b

    if self._needs_initialization:
      self._init()

    return [ value for ( pos, value ) in enumerate( b )
             if pos not in self._remove_b ]


  def apply_x( self, x ):
    """Return x without the positions listed in _remove_x.

    With bypass_x set, x itself is returned and nothing is initialized.
    """

    if self.bypass_x:
      return x

    if self._needs_initialization:
      self._init()

    return [ value for ( pos, value ) in enumerate( x )
             if pos not in self._remove_x ]


  def __call__( self, row ):
    """Filter a row ( y, c, b, x ): y is kept, c/b/x go through apply_*."""

    if self._needs_initialization:
      self._init()

    y, c, b, x = row

    c_out = self.apply_c( c )
    b_out = self.apply_b( b )
    x_out = self.apply_x( x )

    return ( y, c_out, b_out, x_out )
예제 #9
0
from plyvel import DB
from gensim.models.word2vec import Word2Vec
import numpy as np

# Export every word vector of a word2vec model into LevelDB as float16 bytes,
# keyed by the UTF-8 encoded word.
db_name = '../w2v_vectors'
w2v_path = '/home/legin/kudablyat/data/all.norm-sz100-w10-cb0-it1-min100.w2v'
w2v = Word2Vec.load_word2vec_format(w2v_path,
                                    binary=True,
                                    unicode_errors='ignore')
db = DB(db_name, create_if_missing=True)
try:
    for i, key in enumerate(w2v.vocab, 1):
        if i % 1000 == 0:
            # Progress report; print(i) is valid on Python 2 and Python 3,
            # the former `print i` statement was Python 2 only.
            print(i)
        vector = w2v[key]
        # tobytes() returns the same bytes as the deprecated tostring().
        db.put(key.encode('utf-8'), np.array(vector).astype(np.float16).tobytes())
finally:
    # The database used to be left open when the script ended or failed.
    db.close()
예제 #10
0
 def _append(self, db: plyvel.DB, key: bytes, value: bytes) -> None:
     """Store *value* under *key*, concatenated after any bytes already there."""
     # A missing key reads as empty bytes, so the first write stores value alone.
     existing = db.get(key, default=b"", fill_cache=False)
     combined = existing + value
     db.put(key, combined)
# inode0 is a dedicated INode saved under the key b'i_0' at the end of this
# script.  Original note: it takes care of the inode-to-filename mapping and
# the ids of free inodes.
# NOTE(review): INode, sample, sys, file_names, put_inode, add_filename,
# dumps, save_used_blocks and update_vfs are not defined in this fragment -
# presumably provided earlier in the file.
inode0 = INode()
total_blocks = 1024 * 10
block_size = 1024

# Random unique block numbers on which data will reside: the sample size
# equals the size of the range, so this is a shuffled list of every number
# in 1 .. 8 * block_size - 1.
blocks_sample = sample(range(1, 8 * block_size), 8 * block_size - 1)
# The LevelDB block size is taken from the first command-line argument.
db = DB('/home/cujo/nfs/db/db3',
        create_if_missing=True,
        block_size=int(sys.argv[1]))

# One fresh inode per file name, stored under b'i_' + bytes(i).
for i in range(len(file_names)):
    inode = INode()
    put_inode(inode)
    add_filename(file_names[i])
    # NOTE(review): on Python 3 bytes(i) is i zero bytes, so i == 0 yields
    # the key b'i_' rather than b'i_0'; on Python 2 it is the decimal text
    # of i, and the b'i_0' record is then overwritten below - confirm the
    # intended key format.
    db.put(b'i_' + bytes(i), dumps(inode.__dict__))

# Keeping track of unused blocks
# save_free_blocks()
save_used_blocks()
# Finally saving INode0
db.put(b'i_0', dumps(inode0.__dict__))
# Updating FS status: total blocks minus the number of entries missing from
# blocks_sample, minus one per file name, minus one more.
update_vfs(
    block_size,
    total_blocks - (8 * block_size - len(blocks_sample)) - len(file_names) - 1)

db.close()

# Some Stats collection for evaluation
print('Total Blocks inserted = ' +
예제 #12
0
class Snapshot(object):
    """
    use persistent method (like file, db and so on)
    to store (cache) Output of the Input,
    so we can bypass the known pair to save time/cpu/...
    """

    def __init__(self, dbpath, *args, debug=False, refresh=None, **kwargs):
        """
        Open the database at dbpath, creating it when it does not exist.

        :param dbpath: location passed to DB
        :param debug: when true, the log handler level is set to DEBUG
        :param refresh: ignore data in db and refresh using new value
        Remaining positional and keyword arguments go to the parent class.
        """
        super().__init__(*args, **kwargs)
        try:
            self.db = DB(dbpath, create_if_missing=True)
        except Exception as err:
            # Leave a defined attribute behind so close() still works.
            self.db = None
            raise err

        self.upgrade = False
        self.old_key = None
        self.refresh = bool(refresh)

        if debug:
            handler.level = logging.DEBUG

    def __del__(self):
        self.close()

    def __exit__(self):
        self.db.close()

    def __iter__(self):
        for k, v in self.db.iterator():
            yield self.recover_bytes(k), self.recover_bytes(v)

    def __contains__(self, key):
        # raise Exception('we do NOT know which one means EXIST')
        return self.get(key, None) is not None

    def __call__(self, *args, ignore=None, redos=None):
        return self.snapshot(*args, ignore, redos)

    def close(self):
        if self.db:
            self.db.close()
            self.db = None

    @staticmethod
    def to_bytes(data):
        """
        support all basic type.
        but never support recursion data, like List[Dict].
        all data will be translated to bytes if possible.

        use pickle to save bytes so we can store any possible data.
        """
        s = pickle.dumps(data)
        return s

    @staticmethod
    def recover_bytes(data):
        s = data
        return pickle.loads(s)

    def get(self, key, default=None):
        """
        Return the raw stored bytes for key, or default when the database
        returns default.

        The caller decides whether the key exists by comparing the result
        with default.
        """
        logger.debug('key: {}', key, )

        raw_key = self.to_bytes(key)
        stored = self.db.get(raw_key, default)

        if stored != default:
            logger.debug('get exist: {} -> data(type={})', raw_key, type(stored))

        return stored

    def get_result(self, key) -> bytes:
        """
        Look up key and return the stored value decoded from its bytes.

        :param key: key to look up
        :return: the unpickled value, or None when get() returns None
        """
        raw = self.get(key)
        return None if raw is None else self.recover_bytes(raw)

    def put(self, k, v):
        """Pickle k and v and store the pair; returns what db.put returns."""
        logger.debug('put: {} -> data(type={})', k, type(v))
        raw_key = self.to_bytes(k)
        raw_value = self.to_bytes(v)
        return self.db.put(raw_key, raw_value)

    def exist(self, key):
        return key in self

    def delete(self, k):
        key = self.to_bytes(k)
        return self.db.delete(key)

    def set_upgrade(self, *old_args):
        positions, keys = self.get_key_config(*old_args)
        self.upgrade = True
        self.old_key = positions, keys

    @staticmethod
    def get_key_config(*args):
        positions, keys = [], []
        for item in args:
            if isinstance(item, int):
                positions.append(item)
            elif isinstance(item, str):
                keys.append(item)
        return positions, keys

    def get_key(self, positions, keys, *args, **kwargs):
        """
        Build the cache key for one call as a list.

        The list holds args[p] for every p in positions, followed by
        kwargs[k] for every k in keys.
        """
        logger.debug('get key from {} {} (positions:{} keys:{})',
                     args, kwargs, positions, keys, )

        from_positions = [args[p] for p in positions]
        from_names = [kwargs[k] for k in keys]
        return from_positions + from_names

    def snapshot(self, *_args, ignore=None, redos=None, ignore_callback=None, redo_callback=None):
        """
        Build a decorator that caches a function's results in the database.

        the args:
        can be number: the idx/pos of given args
        can be string: the key name in kwargs

        the kwargs:
        :param ignore: a result equal to this value is never stored
        :param redos: stored results found in this list are recomputed, and
            new results found in it are not stored
        :param ignore_callback: optional predicate; a true return means
            "do not store this result"
        :param redo_callback: optional predicate; a true return means
            "treat this result like one listed in redos"
        :return: decorator for the function whose results should be cached
        """
        from functools import wraps

        logger.debug('choose as key: {}', _args)
        positions, keys = self.get_key_config(*_args)

        # will ignore some return value, aka. no snapshot for it
        _ignore = ignore
        # will redo for some return value, should be a list
        _redos = redos or []

        logger.debug('choose position args: {}', positions)
        logger.debug('choose name kwargs: {}', keys)

        def do_snapshot(func):
            def is_ignore(value):
                if value == _ignore:
                    return True

                if ignore_callback and ignore_callback(value):
                    return True

                return False

            def is_redo(value):
                if value in _redos:
                    return True

                if redo_callback and redo_callback(value):
                    return True

                return False

            # wraps() keeps the decorated function's name and docstring.
            @wraps(func)
            def worker(*args, **kwargs):
                key = self.get_key(positions, keys, *args, **kwargs)

                if self.upgrade:
                    old_key = self.get_key(
                        self.old_key[0], self.old_key[1], *args, **kwargs)
                    logger.info('will upgrade old_key: {}', old_key)
                    stored = self.get(old_key)
                    if stored is not None:
                        # Move the entry from the old key to the new one.
                        result = self.recover_bytes(stored)
                        logger.info('upgrade result: {} -> {} -> {}',
                                    old_key, key, result)
                        self.delete(old_key)
                        self.put(key, result)
                        return result

                # Regular lookup.  It also runs in upgrade mode when nothing
                # is stored under the old key; before, that case always
                # recomputed, so an already migrated entry was never reused.
                stored = self.get(key)
                if stored is not None:
                    result = self.recover_bytes(stored)

                    if is_redo(result):
                        logger.warning('redo result: {}', result)
                        logging.getLogger().warning('redo result')
                    elif not self.refresh:
                        return result

                result = func(*args, **kwargs)

                if is_ignore(result):
                    logger.warning('ignore result: {}', result)
                elif is_redo(result):
                    logger.warning('redo result: {}', result)
                else:
                    self.put(key, result)

                return result

            return worker

        return do_snapshot