Example No. 1
def test_comparator():
    def comparator(a, b):
        a = a.lower()
        b = b.lower()
        if a < b:
            return -1
        if a > b:
            return 1
        else:
            return 0

    comparator_name = b"CaseInsensitiveComparator"

    with tmp_db('comparator', create=False) as name:
        db = DB(name,
                create_if_missing=True,
                comparator=comparator,
                comparator_name=comparator_name)

        keys = [
            b'aaa',
            b'BBB',
            b'ccc',
        ]

        with db.write_batch() as wb:
            for key in keys:
                wb.put(key, b'')

        assert_list_equal(
            sorted(keys, key=lambda s: s.lower()),
            list(db.iterator(include_value=False)))
Example No. 2
def test_destroy_db():
    with tmp_db('destroy', create=False, delete=False) as name:
        db = DB(name, create_if_missing=True)
        db.put(b'foo', b'bar')
        db.close()
        del db

        plyvel.destroy_db(name)
        assert not os.path.lexists(name)
Example No. 3
@contextmanager  # needs `from contextlib import contextmanager`; tmp_db is used via `with` in the other examples
def tmp_db(name_prefix, create=True, delete=True):
    name = tempfile.mkdtemp(prefix=name_prefix + '-', dir=TEST_DB_DIR)
    if create:
        db = DB(name, create_if_missing=True, error_if_exists=True)
        yield db
        db.close()
    else:
        yield name

    if delete:
        shutil.rmtree(name)
Example No. 4
def main():
    db = DB("/home/xinyang/Datasets/Wikipedia/enwiki_leveldb/")
    reader = csv.reader(open(DATA_PATH, encoding="latin1"))

    for pid, name, num_pages, num_subcats, num_files in tqdm(reader):
        name = name.encode()
        dobj = dict(num_pages=int(num_pages),
                    num_subcats=int(num_subcats),
                    num_files=int(num_files))
        dobj_dumped = ujson.dumps(dobj).encode()

        db.put(PREFIX + name, dobj_dumped)
Example No. 5
  def __enter__( self ):

    if self._mode == "r":
      with open( self._fn, "rb" ) as f:
        state = pickle_load( f );
        self._len_c = state[ "c" ];
        self._len_b = state[ "b" ];
        self._len_x = state[ "x" ];
        self._lenrow = self._len_c + self._len_b + self._len_x;
        self._ic = state[ "ic" ];
        self._icbp = state[ "icbp" ];

    if self._mode == "w":

      with NamedTemporaryFile() as tmpfn:
        self._kdbfn = tmpfn.name + '.kch';
      self._kdb = KDB();
      try:
        assert self._kdb.open( self._kdbfn, KDB.OWRITER | KDB.OCREATE );
      except:
        print( str( self._kdb.error() ) );
        raise;

      with TemporaryDirectory() as tmpdirname:
        self._ldbdn = tmpdirname;
      self._ldb = LDB( self._ldbdn, create_if_missing=True );

    return self;
Example No. 6
 def _get_db(self, volume_id):
     try:
         db = self.dbs[volume_id]
     except KeyError:
         path = "%s/%s" % (self.db_path, volume_id)
         self.dbs[volume_id] = DB(path, create_if_missing=True)
         db = self.dbs[volume_id]
     return db
Example No. 7
 def _get_db(self, volume_id):
     try:
         db = self.dbs[volume_id]
     except KeyError:
         db_path = self._get_db_path(volume_id)
         self.dbs[volume_id] = DB(db_path, create_if_missing=False)
         db = self.dbs[volume_id]
     return db
Example No. 8
def fetch_city_name_id(city_id='', city_name='', db_name='imd_city_db'):
    '''
        City names, IDs and corresponding links are fetched from the local levelDB.
        If you pass both city_id and city_name, city_id is chosen over city_name
        for the lookup.
        Passing only city_name helps you find possible matches.
        If you pass no arguments, all available records are returned.
    '''
    resp = {}
    try:
        db_handle = DB(db_name, create_if_missing=True)
        if (city_id):
            if (not __validate_city_id__(city_id)):
                raise Exception('city id not validated')
            tmp = db_handle.get(city_id.encode('utf-8'), b'')
            if (tmp):
                resp.update({city_id: tmp.decode('utf-8').split(';')})
            else:
                resp = {'status': 'record not found'}
        elif (city_name):
            resp.update(__match_city_name__(city_name, db_handle.iterator()))
        else:
            itr = db_handle.iterator()
            for i, j in itr:
                resp.update({i.decode('utf-8'): j.decode('utf-8').split(';')})
            itr.close()
        db_handle.close()
    except plError as e:
        resp = {'status': str(e)}
    except Exception as e:
        resp = {'status': str(e)}
    return resp
Example No. 9
def test_open():
    with tmp_db('read_only_dir', create=False) as name:
        # Opening a DB in a read-only dir should not work
        os.chmod(name, stat.S_IRUSR | stat.S_IXUSR)
        with assert_raises(plyvel.IOError):
            DB(name)

    with tmp_db('úñîçøđê_name') as db:
        pass

    with tmp_db('no_create', create=False) as name:
        with assert_raises(plyvel.Error):
            DB(name, create_if_missing=False)

    with tmp_db('exists', create=False) as name:
        db = DB(name, create_if_missing=True)
        db.close()
        with assert_raises(plyvel.Error):
            DB(name, error_if_exists=True)

    with assert_raises(TypeError):
        DB(123)

    with assert_raises(TypeError):
        DB('invalid_option_types', write_buffer_size='invalid')

    with assert_raises(TypeError):
        DB('invalid_option_types', lru_cache_size='invalid')

    with assert_raises(ValueError):
        DB('invalid_compression', compression='invalid',
           create_if_missing=True)

    with tmp_db('no_compression', create=False) as name:
        DB(name, compression=None, create_if_missing=True)

    with tmp_db('many_options', create=False) as name:
        DB(name, create_if_missing=True, error_if_exists=False,
           paranoid_checks=True, write_buffer_size=16 * 1024 * 1024,
           max_open_files=512, lru_cache_size=64 * 1024 * 1024,
           block_size=2 * 1024, block_restart_interval=32,
           compression='snappy', bloom_filter_bits=10)
Example No. 10
 def __init__(self, path):
     self.db = DB(
         path,
         create_if_missing=True,
         lru_cache_size=10*10,
         bloom_filter_bits=64,
         block_size=10**9,
         compression=None,
     )
     self.tuples = self.db.prefixed_db(b'tuples')
     self.index = self.db.prefixed_db(b'index')
Example No. 11
    def __init__(self, dbpath, *args, debug=False, refresh=None, **kwargs):
        """
        :param refresh: ignore data in db and refresh using new value
        """
        super().__init__(*args, **kwargs)
        try:
            self.db = DB(dbpath, create_if_missing=True)
        except Exception as e:
            self.db = None
            raise e
        self.old_key = None
        self.upgrade = False

        if debug:
            handler.level = logging.DEBUG

        if refresh:
            self.refresh = True
        else:
            self.refresh = False
Example No. 12
def deobfuscate_with_db(db: plyvel.DB, data: bytes) -> bytes:

    # Load obfuscation key (if it exists)
    o_key = db.get((bytes.fromhex('0e00') + b'obfuscate_key'))

    # If the key exists, the leading byte indicates the length of the key (8 bytes by default).
    # If there is no key, eight zero bytes are used (since the key will be XORed with the given values).
    if o_key is not None:
        o_key = o_key[1:]

    return deobfuscate_with_key(o_key, data)
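
The helper deobfuscate_with_key is not shown in this example. A minimal sketch consistent with the comment above (XOR the data with the key repeated over its length, falling back to eight zero bytes when no key is stored) could look like this; it is an assumption, not the original implementation:

def deobfuscate_with_key(o_key: bytes, data: bytes) -> bytes:
    # Assumed helper, not part of the original example.
    # With no stored key, eight zero bytes make the XOR a no-op.
    if o_key is None:
        o_key = bytes(8)
    # XOR each data byte with the corresponding byte of the repeated key.
    return bytes(b ^ o_key[i % len(o_key)] for i, b in enumerate(data))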
Example No. 13
def test_repair_db():
    with tmp_db('repair', create=False) as name:
        db = DB(name, create_if_missing=True)
        db.put(b'foo', b'bar')
        db.close()
        del db

        plyvel.repair_db(name)
        db = DB(name)
        assert_equal(b'bar', db.get(b'foo'))
Example No. 14
def set_password(database:plyvel.DB, username, password, salt = '$2b$12$wJs0ZA8HJqt.TwszU7niG.'):
    '''
    :param plyvel.DB database: Level database with wetlands data and stuff
    '''
    user_db = database.prefixed_db(b'users!')
    key = username.encode('utf-8')
    hashed = bcrypt.hashpw(password.encode('utf-8'), salt.encode('utf-8')).decode('utf-8')

    data = json.loads(user_db.get(key, b'{}').decode('utf-8'))
    data['username'] = username
    data['hashed_password'] = hashed
    user_db.put(key, json.dumps(data).encode('utf-8'))
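
A hypothetical verification counterpart (not part of the original code) could read back the record written above and check the password with bcrypt.checkpw:

def check_password(database: plyvel.DB, username, password) -> bool:
    # Hypothetical helper: fetch the record stored by set_password and
    # verify the supplied password against the stored bcrypt hash.
    user_db = database.prefixed_db(b'users!')
    data = json.loads(user_db.get(username.encode('utf-8'), b'{}').decode('utf-8'))
    hashed = data.get('hashed_password')
    if hashed is None:
        return False
    return bcrypt.checkpw(password.encode('utf-8'), hashed.encode('utf-8'))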
Example No. 15
def get_prefixed_db(db: plyvel.DB, prefixes: Iterable[bytes]) -> plyvel.DB:
    """
    Apply all the prefixes (last one included) to obtain the desired prefixed database

    :param db: the initial database
    :param prefixes: the prefix or the iterable of prefixes to apply
    :returns: the prefixed database
    """

    for prefix in prefixes:
        if prefix is Ellipsis:
            raise TypeError(
                f"str prefix or key expected, got {type(prefix).__name__}")
        db = db.prefixed_db(prefix)

    return db
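
A usage sketch (path and prefixes are illustrative only); applying two prefixes is equivalent to chaining prefixed_db calls:

db = plyvel.DB('/tmp/example-db', create_if_missing=True)
# Same result as db.prefixed_db(b'users!').prefixed_db(b'2021!')
users_2021 = get_prefixed_db(db, [b'users!', b'2021!'])
users_2021.put(b'alice', b'{}')
db.close()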
Example No. 16
def save_hucs(filename, db:plyvel.DB):
    reader = shapefile.Reader(filename)
    huc_db = db.prefixed_db(b'huc!')

    fields = reader.fields[1:]
    field_names = [field[0] for field in fields]
    with huc_db.write_batch() as batch:
        for sr in reader.shapeRecords():
            geometry = sr.shape.__geo_interface__
            geometry['coordinates'] = [[p(lon, lat, inverse = True) for lon, lat in subpolygon] for subpolygon in geometry['coordinates']]
            feature = {
                'type': 'Feature',
                'geometry': geometry,
                'properties': dict(zip(field_names, sr.record)),
            }
            key = feature['properties']['HUC_CODE'].encode('utf-8')
            value = json.dumps(feature).encode('utf-8')
            batch.put(key, value)
Example No. 17
def store_city_name_id(data, db_name='imd_city_db'):
    '''
        Stores city names, IDs and corresponding links in a local levelDB.
        The city ID is used as the key.
    '''
    resp = {}
    try:
        db_handle = DB(db_name, create_if_missing=True)
        data = __format_db_entry__(data)
        for i, j in data.items():
            db_handle.put(i, j)
        db_handle.close()
        resp = {'status': 'success'}
    except plError as e:
        resp = {'status': str(e)}
    except Exception as e:
        resp = {'status': str(e)}
    return resp
Example No. 18
def set_ext_key(database: Path, extension: str, key: str, value: object):
    """Sets the value to the key of the given extension."""

    DB(str(database)).put(get_key(extension, key), get_value(value))
Example No. 19
 def _new_batch(self, db: plyvel.DB, sync: bool):
     return db.write_batch(sync=sync)
Example No. 20
def load_w2v():
    global w2v_model
    print 'loading w2v'
    w2v_model = DB(config.w2v_path)
    print 'loaded'
Example No. 21
def test_approximate_sizes():
    with tmp_db('approximate_sizes', create=False) as name:

        # Write some data to a fresh database
        db = DB(name, create_if_missing=True, error_if_exists=True)
        value = b'a' * 100
        with db.write_batch() as wb:
            for i in range(1000):
                key = str(i).encode('ascii') * 100
                wb.put(key, value)

        # Close and reopen the database
        db.close()
        del wb, db
        db = DB(name, create_if_missing=False)

        with assert_raises(TypeError):
            db.approximate_size(1, 2)

        with assert_raises(TypeError):
            db.approximate_sizes(None)

        with assert_raises(TypeError):
            db.approximate_sizes((1, 2))

        # Test single range
        assert_greater_equal(db.approximate_size(b'1', b'2'), 0)

        # Test multiple ranges
        assert_list_equal([], db.approximate_sizes())
        assert_greater_equal(db.approximate_sizes((b'1', b'2'))[0], 0)

        ranges = [
            (b'1', b'3'),
            (b'', b'\xff'),
        ]
        assert_equal(len(ranges), len(db.approximate_sizes(*ranges)))
Example No. 22
 def __init__(self, store: KeyValueStore, db: plyvel.DB, sync: bool):
     super().__init__(store, sync=sync)
     self._touched_keys = set()
     self._snapshot = db.snapshot()
Example No. 23
def test_open_close():
    with tmp_db('open_close', create=False) as name:
        # Create a database with options that result in additional
        # object allocation (e.g. LRU cache).
        db = DB(name,
                create_if_missing=True,
                lru_cache_size=1024 * 1024,
                bloom_filter_bits=10)
        db.put(b'key', b'value')
        wb = db.write_batch()
        sn = db.snapshot()
        it = db.iterator()
        snapshot_it = sn.iterator()

        # Close the database
        db.close()
        assert db.closed

        # Expect runtime errors for operations on the database,
        with assert_raises(RuntimeError):
            db.get(b'key')
        with assert_raises(RuntimeError):
            db.put(b'key', b'value')
        with assert_raises(RuntimeError):
            db.delete(b'key')

        # ... on write batches,
        with assert_raises(RuntimeError):
            wb.put(b'key', b'value')

        # ... on snapshots,
        assert_raises(RuntimeError, db.snapshot)
        with assert_raises(RuntimeError):
            sn.get(b'key')

        # ... on iterators,
        with assert_raises(RuntimeError):
            next(it)

        # ... and on snapshot iterators,
        with assert_raises(RuntimeError):
            next(snapshot_it)
Example No. 24
from plyvel import DB
from gensim.models.word2vec import Word2Vec
import numpy as np

db_name = '../w2v_vectors'
w2v_path = '/home/legin/kudablyat/data/all.norm-sz100-w10-cb0-it1-min100.w2v'
w2v = Word2Vec.load_word2vec_format(w2v_path,
                                    binary=True,
                                    unicode_errors='ignore')
db = DB(db_name, create_if_missing=True)
i = 0
for key in w2v.vocab:
    i += 1
    if i % 1000 == 0:
        print i
    vector = w2v[key]
    db.put(key.encode('utf-8'), np.array(vector).astype(np.float16).tostring())
Example No. 25
 def create(self, volume_id):
     db_path = self._get_db_path(volume_id)
     DB(db_path, create_if_missing=True)
Example No. 26
 def _append(self, db: plyvel.DB, key: bytes, value: bytes) -> None:
     """ Add key-value pair to DB, appending if a value already exists."""
     prev = db.get(key, default=b"", fill_cache=False)
     db.put(key, prev + value)
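
For illustration only (database path and keys are made up), the same read-append-write pattern driven from module level:

import plyvel

db = plyvel.DB('/tmp/append-example', create_if_missing=True)

# Appending twice concatenates the values, starting from b"" for a missing key.
for chunk in (b'first;', b'second;'):
    prev = db.get(b'log', default=b'', fill_cache=False)
    db.put(b'log', prev + chunk)

assert db.get(b'log') == b'first;second;'
db.close()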
Example No. 27
import stat
import errno
import fuse
import sys
from json import loads
from fuse import Fuse
from plyvel import DB


if not hasattr(fuse, '__version__'):
    raise RuntimeError, \
        "your fuse-py doesn't know of fuse.__version__, probably it's too old."

fuse.fuse_python_api = (0, 2)

db = DB('/home/cujo/nfs/db/db1', block_size=int(sys.argv[1]))


class LWStat(fuse.Stat):
    def __init__(self):
        fuse.Stat.__init__(self)
        self.st_mode = 0
        self.st_ino = 0
        self.st_dev = 0
        self.st_nlink = 0
        self.st_uid = 0
        self.st_gid = 0
        self.st_size = 0
        self.st_atime = 0
        self.st_mtime = 0
        self.st_ctime = 0
Example No. 28
class FeatureSelector( Frontend ):


  def __init__( self, fn, mode ):    

    Frontend.__init__( self, fn, mode );

    self._kdbfn = None;
    self._kdb = None;

    self._ldbdn = None;
    self._ldb = None;

    self._len_c = None;
    self._len_b = None;
    self._len_x = None;

    self._ic = None;
    self._icbp = None;

    self._needs_initialization = True;

    self._core_dims = set();
    self._satellite_dims = set();
    self._removed_dims = set();

    self._remove_c = set();
    self._remove_b = set();
    self._remove_x = set();

    self.bypass_c = False;
    self.bypass_b = False;
    self.bypass_x = False;


  def __enter__( self ):

    if self._mode == "r":
      with open( self._fn, "rb" ) as f:
        state = pickle_load( f );
        self._len_c = state[ "c" ];
        self._len_b = state[ "b" ];
        self._len_x = state[ "x" ];
        self._lenrow = self._len_c + self._len_b + self._len_x;
        self._ic = state[ "ic" ];
        self._icbp = state[ "icbp" ];

    if self._mode == "w":

      with NamedTemporaryFile() as tmpfn:
        self._kdbfn = tmpfn.name + '.kch';
      self._kdb = KDB();
      try:
        assert self._kdb.open( self._kdbfn, KDB.OWRITER | KDB.OCREATE );
      except:
        print( str( self._kdb.error() ) );
        raise;

      with TemporaryDirectory() as tmpdirname:
        self._ldbdn = tmpdirname;
      self._ldb = LDB( self._ldbdn, create_if_missing=True );

    return self;

  def __exit__( self, exc_type, exc_value, traceback ):

    assert Frontend.__exit__( self, exc_type, exc_value, traceback ) == False;

    if self._ldb is not None:
      sleep( 3.0 );
      self._ldb.close()

    if self._ldbdn is not None:
      rmtree( self._ldbdn );

    if self._kdb is not None:
      try:
        assert self._kdb.close();
      except:
        print( str( self._kdb.error() ) );
        raise;

    if self._kdbfn is not None:
      remove( self._kdbfn );


  def train( self, row ):

    ( y, c, b, x ) = row;

    if self._len_c is None:
      self._len_c = len(c);
    assert self._len_c == len(c);

    if self._len_b is None:
      self._len_b = len(b);
    assert self._len_b == len(b);

    if self._len_x is None:
      self._len_x = len(x);
    assert self._len_x == len(x);

    row = c + b + x;

    if Frontend.train( self, row ):
      return True;

    keyfmt = '>IIIII';

    for i in range( 0, self._lenrow ):
      for j in range( 0, self._lenrow ):

        if ( i >= j ) and ( not ( i == self._lenrow-1 ) ):
          continue;

        key = pack( keyfmt, i, j, y, row[i], row[j] );

        try:
          assert self._kdb.increment( key, 1, 0 );
        except:
          print( str(self._kdb.error()) );
          raise;


  def _stats( self, cnt_by_a, cnt_by_b, cnt_by_ab ):

    h_a = 0.0;
    h_b = 0.0;
    h_ab = 0.0;

    for ( val_a, cnt ) in cnt_by_a.items():
      p = float(cnt) / float(self._rowcount);
      if p > 0.0:      
        h_a -= p * log( p, 2.0 );

    for ( val_b, cnt ) in cnt_by_b.items():
      p = float(cnt) / float(self._rowcount);
      if p > 0.0:      
        h_b -= p * log( p, 2.0 );

    for( (val_a,val_b), cnt ) in cnt_by_ab.items():
      p = float(cnt) / float(self._rowcount);
      if p > 0.0:      
        h_ab -= p * log( p, 2.0 );

    if h_a == 0.0:
      return 1.0;

    if h_b == 0.0:
      return 1.0;
    
    mi = h_a + h_b - h_ab;
    return ( mi / min( h_a, h_b ), h_a, h_b, h_ab, mi );


  def _get_info_content_by_dimension( self, i ):

    keyfmt = '>IIIII';
    valfmt = '>Q';

    j = None;

    cnt_by_a = {};
    cnt_by_b = {};
    cnt_by_ab = {};
    total = 0;

    with self._ldb.iterator() as it:

      it.seek( pack( keyfmt, i,0,0,0,0 ) );

      for ( key, val ) in it:

        key = unpack( keyfmt, key );
        val = unpack( valfmt, val )[ 0 ];

        if not ( key[0] == i ):
          break;

        if j is None:
          j = key[1];

        if not ( key[1] == j ):
          break;

        # key[2] is the y-value
        a = key[2];

        # key[3] is the value for the i-th dimension
        b = key[3];

        cnt_by_ab[ (a,b) ] = cnt_by_ab.get( (a,b), 0 ) + val;
        cnt_by_a[ a ] = cnt_by_a.get( a, 0 ) + val;
        cnt_by_b[ b ] = cnt_by_b.get( b, 0 ) + val;

        total += val;

    try:
      assert total == self._rowcount;
    except:
      print( i, j, total, self._rowcount );
      raise;

    return self._stats( cnt_by_a, cnt_by_b, cnt_by_ab );


  def _get_info_content_by_pair( self, i, j ):

    keyfmt = '>IIIII';
    valfmt = '>Q';

    cnt_by_a = {};
    cnt_by_b = {};
    cnt_by_ab = {};
    total = 0;

    with self._ldb.iterator() as it:

      it.seek( pack( keyfmt, i,j,0,0,0 ) );

      for ( key, val ) in it:

        key = unpack( keyfmt, key );
        val = unpack( valfmt, val )[ 0 ];

        if not ( ( key[0] == i ) and ( key[1] == j ) ):
          break;

        # key[2] is the y-value, key[3] the value for the i-th dim
        a = ( key[2], key[3] ); 

        # key[2] is the y-value, key[4] the value for the j-th dim
        b = ( key[2], key[4] );

        assert (a,b) not in cnt_by_ab;
        cnt_by_ab[ (a,b) ] = cnt_by_ab.get( (a,b), 0 ) + val;

        cnt_by_a[ a ] = cnt_by_a.get( a, 0 ) + val;
        cnt_by_b[ b ] = cnt_by_b.get( b, 0 ) + val;

        total += val;

    assert total == self._rowcount;

    return self._stats( cnt_by_a, cnt_by_b, cnt_by_ab );


  def _finalize( self ):

    assert Frontend._finalize( self ) is None;

    if False:
      print( "unique combinations = ", self._kdb.count() );

    keyfmt = '>IIIII';
    valfmt = '>Q';

    c = self._kdb.cursor();
    c.jump();

    gt2 = 0;
    gt4 = 0;
    gt8 = 0;
    gt16 = 0;
    gt32 = 0;

    while True:

      r = c.get( True );
      if not r:
        break;

      self._ldb.put( r[0], r[1] );

      key = unpack( keyfmt, r[0] );
      val = unpack( valfmt, r[1] )[ 0 ];

      if val > 2:
        gt2 += 1;
      if val > 4:
        gt4 += 1;
      if val > 8:
        gt8 += 1;
      if val > 16:
        gt16 += 1;
      if val > 32:
        gt32 += 1;

    if False:
      print( gt2, gt4, gt8, gt16, gt32 );

    self._ic = {};
    for i in range( 0, self._lenrow ):
      self._ic[ i ] = self._get_info_content_by_dimension( i );

    self._icbp = {};

    for i in range( 0, self._lenrow ):
      for j in range( 0, self._lenrow ):

        if i >= j:
          continue;

        self._icbp[ (i,j) ] = self._get_info_content_by_pair( i, j );


    self._state \
      = { "ic": self._ic,
          "icbp": self._icbp,
          "c": self._len_c,
          "b": self._len_b,
          "x": self._len_x };


  def _fmt_dim( self, d_ ):

    d = None;
    if d_ < self._len_c:
      d = "c" + str( d_ );
    elif d_ < self._len_c + self._len_b:
      d = "b" + str( d_ - self._len_c );
    elif d_ < self._len_c + self._len_b + self._len_x:
      d = "x" + str( d_ - self._len_c - self._len_b );
    else:
      assert False;
    return "{:d}({:s})".format( d_, d );


  def _init( self ):

    self._needs_initialization = False;

    if False:

      for i in sorted( self._ic ):

        (corr,h_a,h_b,h_ab,mi) = self._ic[ i ];

        print(
            "{:s} {:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"\
             .format(
                  self._fmt_dim( i ),
                  corr,
                  h_a,
                  h_b,
                  h_ab,
                  mi
                )
          );

      for (i,j) in sorted( self._icbp ):

        (corr,h_a,h_b,h_ab,mi) = self._icbp[ (i,j) ];

        print(
            "{:s} {:s} {:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"\
             .format(
                  self._fmt_dim( i ),
                  self._fmt_dim( j ),
                  corr,
                  h_a,
                  h_b,
                  h_ab,
                  mi
                )
          );

    entropy \
      = [ ( h_ab, i ) \
          for ( i, (corr,h_a,h_b,h_ab,mi) ) in self._ic.items() ];          

    output_correlation \
      = [ ( corr, i ) \
          for ( i, (corr,h_a,h_b,h_ab,mi) ) in self._ic.items() ];

    self._core_dims = set();

    self._core_dims \
      |= { i \
           for ( h_ab, i ) \
           in sorted( entropy, reverse=True )[ :5 ] };

    self._core_dims \
      |= { i \
           for ( h_ab, i ) \
           in sorted( output_correlation, reverse=True )[ :3 ] };

    if True:
      print(
          "core = ",
          " ".join([ self._fmt_dim(d) for d in self._core_dims ])
        );

    self._satellite_dims = set();

    for core_dim in self._core_dims:

      satellite_dim = None;
      satellite_dim_c = None;
      satellite_dim_stats = None;

      for ( (i,j), (corr,h_a,h_b,h_ab,mi) ) in self._icbp.items():

        if corr <= 0.5:
          continue;

        other_dim = None;
        if i == core_dim:
          other_dim = j;
        elif j == core_dim:
          other_dim = i;
        else:
          continue;

        if ( satellite_dim_c is None ) or ( corr > satellite_dim_c ):

          satellite_dim = other_dim;
          satellite_dim_c = corr;
          satellite_dim_stats = (corr,h_a,h_b,h_ab,mi);

      if satellite_dim is not None:

        self._satellite_dims.add( satellite_dim );      

        if False:

          print(
              '->',
              self._fmt_dim(core_dim),
              self._fmt_dim(satellite_dim)
            );

          print(
              "{:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"\
               .format( *(corr,h_a,h_b,h_ab,mi) )
            );

    if True:

      print(
          "satellite = ",
          " ".join([ self._fmt_dim(d) for d in self._satellite_dims ])
        );

    self._removed_dims = set();
    for i in self._ic:
      if i not in self._core_dims and i not in self._satellite_dims:
        self._removed_dims.add( i );

    if True:

      print(
          "removed = ",
          " ".join([ self._fmt_dim(d) for d in self._removed_dims ])
        );

    for d_ in self._removed_dims:
      if d_ < self._len_c:
        self._remove_c.add( d_ );
      elif d_ < self._len_c + self._len_b:
        self._remove_b.add( d_ - self._len_c );
      elif d_ < self._len_c + self._len_b + self._len_x:
        self._remove_x.add( d_ - self._len_c - self._len_b );
      else:
        assert False;


  def apply_c( self, c ):

    if self.bypass_c:
      return c;

    if self._needs_initialization:
      self._init();

    c_ = [];
    for ( i, cval ) in enumerate( c ):
      if not i in self._remove_c:
        c_.append( cval );
    return c_;


  def apply_b( self, b ):

    if self.bypass_b:
      return b;

    if self._needs_initialization:
      self._init();

    b_ = [];
    for ( i, bval ) in enumerate( b ):
      if not i in self._remove_b:
        b_.append( bval );
    return b_;


  def apply_x( self, x ):

    if self.bypass_x:
      return x;

    if self._needs_initialization:
      self._init();

    x_ = [];
    for ( i, xval ) in enumerate( x ):
      if not i in self._remove_x:
        x_.append( xval );
    return x_;


  def __call__( self, row ):

    if self._needs_initialization:
      self._init();

    ( y, c, b, x ) = row;

    y_ = y;

    return \
      ( y_,
        self.apply_c( c ),
        self.apply_b( b ),
        self.apply_x( x ) );
Example No. 29
import os
from plyvel import DB
from bitcoin_requests import BitcoinRPC

BITCOIN_RPC_ADDRESS = os.getenv(
    "BITCOIN_RPC_ADDRESS") or "http://127.0.0.1:8443"
BITCOIN_RPC_USER = os.getenv("BITCOIN_RPC_USER")
BITCOIN_RPC_PASSWORD = os.getenv("BITCOIN_RPC_PASSWORD")

bitcoin = BitcoinRPC(BITCOIN_RPC_ADDRESS, BITCOIN_RPC_USER,
                     BITCOIN_RPC_PASSWORD)
next_block = bitcoin.getblockchaininfo()["blocks"]

db = DB("db", create_if_missing=True)
Example No. 30
#!/usr/bin/env python
import csv
from collections import defaultdict

import ujson
from plyvel import DB

CATEGORYLINKS_PATH = "/home/xinyang/Datasets/Wikipedia/csv/enwiki-20180101-categorylinks-clean.csv"
PREFIX = b"category_pageids_"

if __name__ == "__main__":
    db = None

    cate_pageids = defaultdict(list)
    with open(CATEGORYLINKS_PATH) as fobj:
        reader = csv.reader(fobj)
        for row in reader:
            if row[2] != 'page':
                continue

            page_id, cat_name = row[0], row[1]
            cate_pageids[row[1]].append(row[0])

    try:
        db = DB("/home/xinyang/Datasets/Wikipedia/enwiki_leveldb")
        for cate_name, page_ids in cate_pageids.items():
            db.put(PREFIX + cate_name.encode(), ujson.dumps(page_ids).encode())
    finally:
        if db is not None:
            db.close()
Example No. 31
class LevelDBStorage(object):
    """Generic database"""

    def __init__(self, path):
        self.db = DB(
            path,
            create_if_missing=True,
            lru_cache_size=10*10,
            bloom_filter_bits=64,
            block_size=10**9,
            compression=None,
        )
        self.tuples = self.db.prefixed_db(b'tuples')
        self.index = self.db.prefixed_db(b'index')

    def close(self):
        self.db.close()

    def ref(self, uid, key):
        match = [uid, key]
        for key, value in self.tuples.iterator(start=pack(uid, key)):
            other = unpack(key)
            if other == match:
                value = unpack(value)[0]
                return value
            else:
                return None

    def get(self, uid):
        def __get():
            for key, value in self.tuples.iterator(start=pack(uid)):
                other, key = unpack(key)
                if other == uid:
                    value = unpack(value)[0]
                    yield key, value
                else:
                    break

        tuples = dict(__get())
        return tuples

    def add(self, uid, **properties):
        tuples = self.tuples.write_batch(transaction=True)
        index = self.index.write_batch(transaction=True)
        for key, value in properties.items():
            tuples.put(pack(uid, key), pack(value))
            index.put(pack(key, value, uid), '')
        tuples.write()
        index.write()

    def delete(self, uid):
        tuples = self.tuples.write_batch(transaction=True)
        index = self.index.write_batch(transaction=True)
        for key, value in self.tuples.iterator(start=pack(uid)):
            other, name = unpack(key)
            if uid == other:
                tuples.delete(key)
                value = unpack(value)[0]
                index.delete(pack(name, value, uid))
            else:
                break
        tuples.write()
        index.write()

    def update(self, uid, **properties):
        self.delete(uid)
        self.add(uid, **properties)

    def debug(self):
        for key, value in self.tuples.iterator():
            uid, key = unpack(key)
            value = unpack(value)[0]
            print(uid, key, value)

    def query(self, key, value=''):
        match = (key, value) if value else (key,)

        iterator = self.index.iterator(start=pack(key, value))
        for key, value in iterator:
            other = unpack(key)
            ok = reduce(
                lambda previous, x: (cmp(*x) == 0) and previous,
                zip(match, other),
                True
            )
            if ok:
                yield other
            else:
                break
Example No. 32
blocks_sample = sample(range(0, 8 * 1024), 8 * 1024)


class INode(object):
    def __init__(self):
        self.f_size = int(0)
        self.f_blocks = [None] * 8
        # Free blocks - Can be ignored
        self.f_frblocks = int(0)


total_blocks = 1024 * 10
block_size = 1024
ev_inodes, ev_blocks = (0, 0)
db = DB('/home/cujo/nfs/db/db2',
        create_if_missing=True,
        block_size=int(sys.argv[1]))
_, current_blocks = update_vfs(block_size, total_blocks)

for i in range(1024):
    bytes_written = 0
    inode = INode()
    for k in range(len(inode.f_blocks) - 1):
        block_number = blocks_sample.pop()
        inode.f_blocks[k] = block_number
        bytes_written += populate_block(block_number)
        ev_blocks += 1
        _, current_blocks = update_vfs(block_size, current_blocks)
    inode.f_size = bytes_written
    db.put(b'i_' + bytes(i), dumps(inode.__dict__))
    ev_inodes += 1
Example No. 33
class Snapshot(object):
    """
    Use a persistent method (file, db and so on)
    to store (cache) the output for a given input,
    so known input/output pairs can be bypassed to save time/cpu/...
    """

    def __init__(self, dbpath, *args, debug=False, refresh=None, **kwargs):
        """
        :param refresh: ignore data in db and refresh using new value
        """
        super().__init__(*args, **kwargs)
        try:
            self.db = DB(dbpath, create_if_missing=True)
        except Exception as e:
            self.db = None
            raise e
        self.old_key = None
        self.upgrade = False

        if debug:
            handler.level = logging.DEBUG

        if refresh:
            self.refresh = True
        else:
            self.refresh = False

    def __del__(self):
        self.close()

    def __exit__(self, exc_type, exc_value, traceback):
        self.db.close()

    def __iter__(self):
        for k, v in self.db.iterator():
            yield self.recover_bytes(k), self.recover_bytes(v)

    def __contains__(self, key):
        # raise Exception('we do NOT know which one means EXIST')
        return self.get(key, None) is not None

    def __call__(self, *args, ignore=None, redos=None):
        return self.snapshot(*args, ignore=ignore, redos=redos)

    def close(self):
        if self.db:
            self.db.close()
            self.db = None

    @staticmethod
    def to_bytes(data):
        """
        Supports all basic types,
        but not nested data such as List[Dict].
        All data is translated to bytes if possible.

        Pickle is used to produce the bytes, so any picklable data can be stored.
        """
        s = pickle.dumps(data)
        return s

    @staticmethod
    def recover_bytes(data):
        s = data
        return pickle.loads(s)

    def get(self, key, default=None):
        """
        The user should determine whether the key exists or not
        (according to the default)
        """
        logger.debug('key: {}', key, )

        key = self.to_bytes(key)
        data = self.db.get(key, default)

        if data != default:
            logger.debug('get exist: {} -> data(type={})', key, type(data))

        return data

    def get_result(self, key) -> bytes:
        """
        get the value related to the key,
        return the result by decoding it from bytes
        :param key:
        :return:
        """
        data = self.get(key)
        if data is None:
            return None
        else:
            return self.recover_bytes(data)

    def put(self, k, v):
        logger.debug('put: {} -> data(type={})', k, type(v))
        key = self.to_bytes(k)
        data = self.to_bytes(v)
        return self.db.put(key, data)

    def exist(self, key):
        return key in self

    def delete(self, k):
        key = self.to_bytes(k)
        return self.db.delete(key)

    def set_upgrade(self, *old_args):
        positions, keys = self.get_key_config(*old_args)
        self.upgrade = True
        self.old_key = positions, keys

    @staticmethod
    def get_key_config(*args):
        positions, keys = [], []
        for item in args:
            if isinstance(item, int):
                positions.append(item)
            elif isinstance(item, str):
                keys.append(item)
        return positions, keys

    def get_key(self, positions, keys, *args, **kwargs):
        logger.debug('get key from {} {} (positions:{} keys:{})',
                     args, kwargs, positions, keys, )

        key = []
        for p in positions:
            key.append(args[p])
        for k in keys:
            key.append(kwargs[k])
        return key

    def snapshot(self, *_args, ignore=None, redos=None, ignore_callback=None, redo_callback=None):
        """
        The positional args select which of the wrapped function's arguments form the key:
        a number picks the idx/pos of a positional arg,
        a string picks a key name in kwargs.

        The keyword arguments configure the snapshot behaviour.
        """
        logger.debug('choose as key: {}', _args)
        positions, keys = self.get_key_config(*_args)

        # will ignore some return value, aka. no snapshot for it
        _ignore = ignore
        # will redo for some return value, should be a list
        _redos = redos or []

        logger.debug('choose position args: {}', positions)
        logger.debug('choose name kwargs: {}', keys)

        def do_snapshot(func):
            def is_ignore(value):
                if value == _ignore:
                    return True

                if ignore_callback and ignore_callback(value):
                    return True

                return False

            def is_redo(value):
                if value in _redos:
                    return True

                if redo_callback and redo_callback(value):
                    return True

                return False

            def worker(*args, **kwargs):
                key = self.get_key(positions, keys, *args, **kwargs)

                if self.upgrade:
                    old_key = self.get_key(
                        self.old_key[0], self.old_key[1], *args, **kwargs)
                    logger.info('will upgrade old_key: {}', old_key)
                    result = self.get(old_key)
                    if result is not None:
                        result = self.recover_bytes(result)
                        logger.info('upgrade result: {} -> {} -> {}',
                                    old_key, key, result)
                        self.delete(old_key)
                        self.put(key, result)
                        return result
                else:
                    result = self.get(key)
                    if result is None:
                        pass
                    else:
                        result = self.recover_bytes(result)

                        if is_redo(result):
                            logger.warning('redo result: {}', result)
                            logging.getLogger().warning('redo result')
                        elif self.refresh:
                            pass
                        else:
                            return result

                result = func(*args, **kwargs)
                value = result

                if is_ignore(value):
                    logger.warning('ignore result: {}', result)
                elif is_redo(value):
                    logger.warning('redo result: {}', result)
                else:
                    self.put(key, value)

                return result

            return worker

        return do_snapshot