def __init__(self, fn, mode):
    """Create the frontend with every piece of derived state still unset.

    Args:
        fn: Passed through unchanged to ``Frontend.__init__``.
        mode: Passed through unchanged to ``Frontend.__init__``.
    """
    Frontend.__init__(self, fn, mode)

    # Storage objects and their filesystem locations (presumably two
    # databases, judging by the names); nothing is opened here.
    self._kdbfn = self._kdb = None
    self._ldbdn = self._ldb = None

    # Lengths of the c / b / x parts of a row; unknown until set elsewhere.
    self._len_c = self._len_b = self._len_x = None

    # Results that are filled in later; unset for now.
    self._ic = self._icbp = None

    self._needs_initialization = True

    # Dimension bookkeeping; every attribute gets its own empty set.
    self._core_dims = set()
    self._satellite_dims = set()
    self._removed_dims = set()
    self._remove_c = set()
    self._remove_b = set()
    self._remove_x = set()

    # Public switches, all off by default.
    self.bypass_c = self.bypass_b = self.bypass_x = False
def __init__(self, fn, mode):
    """Create the frontend with no per-dimension databases yet.

    Args:
        fn: Passed through unchanged to ``Frontend.__init__``.
        mode: Passed through unchanged to ``Frontend.__init__``.
    """
    Frontend.__init__(self, fn, mode)
    self._max_rows = 50000
    # Both containers start empty and are populated outside this method.
    self._dbs = dict()
    self._dbdirs = list()
def _finalize(self):
    """Derive per-dimension n-tile boundary values from the stored counts.

    For each database in ``self._dbs`` the ``value -> count`` entries are
    read in iterator order while a running row total is accumulated.  Each
    time that total crosses one of the 31 row positions
    ``int(rowcount * k / 32)`` (``k`` = 1..31), the value being visited is
    recorded as a boundary for that dimension.  A value whose count spans
    several positions is recorded once per position.

    The result, a dict mapping each key of ``self._dbs`` to its list of
    boundary values, is stored in ``self._state``.
    """
    # Keep the base-class call outside of ``assert``: assert statements are
    # removed under ``python -O`` and the call must still happen.
    base_result = Frontend._finalize(self)
    assert base_result is None

    total_rows = float(self._rowcount)
    bound_rowids = [int(total_rows * float(k) / 32.0) for k in range(1, 32)]

    bounds_by_dim = {dim: [] for dim in self._dbs}
    for dim, db in self._dbs.items():
        dim_bounds = bounds_by_dim[dim]
        with db.iterator() as it:
            rows_seen = 0
            for raw_value, raw_count in it:
                # Keys are stored big-endian with a 2**31 offset; undo it.
                value = unpack('>I', raw_value)[0] - (1 << 31)
                count = unpack('>I', raw_count)[0]
                rows_before, rows_seen = rows_seen, rows_seen + count
                dim_bounds.extend(
                    value
                    for bound in bound_rowids
                    if rows_before < bound <= rows_seen
                )
    self._state = bounds_by_dim
def train(self, row):
    """Add one row to the per-dimension value histograms.

    Each element of ``row`` is scaled by 1000, truncated to an int, and
    counted in the database belonging to its position.  A database is
    created the first time a position is seen.

    Args:
        row: Iterable of values convertible with ``float()``.

    Returns:
        True if ``Frontend.train`` returned a true value (the row is then
        not counted here), otherwise False.
    """
    if Frontend.train(self, row):
        return True

    for dim, raw in enumerate(row):
        if dim not in self._dbs:
            # The context manager is used only to obtain a fresh path: the
            # directory is removed again on leaving the ``with`` block and
            # the database is then opened at that path with
            # ``create_if_missing=True``.
            with TemporaryDirectory() as scratch:
                dbdir = scratch
            self._dbdirs.append(dbdir)
            self._dbs[dim] = plyvel.DB(dbdir, create_if_missing=True)
        db = self._dbs[dim]

        scaled = int(float(raw) * 1000.0)
        assert scaled < (1 << 30)

        # Offset by 2**31 so the value fits an unsigned 32-bit big-endian key.
        key = pack(">I", (1 << 31) + scaled)
        stored = db.get(key, default=pack(">I", 0))
        count = unpack(">I", stored)[0] + 1
        db.put(key, pack(">I", count))

    return False
def train(self, row):
    """Remember ``row`` for later processing.

    Args:
        row: The row to store; it is appended to ``self._data`` as-is.

    Returns:
        True if ``Frontend.train`` returned a true value (the row is then
        not stored), otherwise False.
    """
    handled_by_base = Frontend.train(self, row)
    if handled_by_base:
        return True
    self._data.append(row)
    return False
def __exit__(self, exc_type, exc_value, traceback):
    """Close every per-dimension database and delete its directory.

    Args:
        exc_type: Forwarded to ``Frontend.__exit__``.
        exc_value: Forwarded to ``Frontend.__exit__``.
        traceback: Forwarded to ``Frontend.__exit__``.

    Returns:
        False, so an exception raised inside the ``with`` body propagates.
    """
    # Keep the base-class call outside of ``assert``: assert statements are
    # removed under ``python -O`` and the base cleanup must still run.
    base_result = Frontend.__exit__(self, exc_type, exc_value, traceback)
    assert base_result == False

    # NOTE(review): the reason for this fixed delay is not visible here;
    # it is kept as-is.
    sleep(3.0)

    for db in self._dbs.values():
        db.close()
    for dirname in self._dbdirs:
        rmtree(dirname)
    return False
def train(self, row):
    """Count how often each on/off pattern of ``row`` occurs.

    The row is folded into one integer in which bit ``i`` is set exactly
    when ``row[i] == 1``; the counter for that integer in ``self._stats``
    is then incremented.

    Args:
        row: Sequence of values; only elements equal to 1 set a bit.

    Returns:
        True if ``Frontend.train`` returned a true value (nothing is counted
        then), otherwise False.
    """
    if Frontend.train(self, row):
        return True

    # Every position contributes a distinct power of two, so summing them
    # yields the same integer as OR-ing the bits together.
    pattern = sum(1 << pos for pos, value in enumerate(row) if value == 1)
    self._stats[pattern] = self._stats.get(pattern, 0) + 1
    return False
def _finalize(self):
    """Group the trained dimensions into clusters by repeated merging.

    The collected rows are transposed so that ``data[k]`` holds column ``k``.
    Every column starts as its own singleton cluster, identified by the
    1-based id ``k + 1``.  ``self._merge`` is then applied repeatedly until
    a pass no longer changes the number of clusters.  The final list of
    clusters is stored in ``self._state``.
    """
    # Keep the base-class call outside of ``assert``: assert statements are
    # removed under ``python -O`` and the call must still happen.
    base_result = Frontend._finalize(self)
    assert base_result is None

    data = np.array(self._data).T
    clusters = [{dim_id} for dim_id in range(1, len(data) + 1)]

    while True:
        count_before = len(clusters)
        clusters = self._merge(data, clusters)
        if len(clusters) == count_before:
            break

    self._state = clusters
def __call__(self, row):
    """Project ``row`` onto each cluster as a masked bit pattern.

    The row is folded into one integer in which bit ``i`` is set exactly
    when ``row[i] == 1``.  For every cluster in ``self._state`` a mask with
    one bit per member dimension is built, and the row integer AND-ed with
    that mask becomes one element of the result.

    Args:
        row: Sequence of values; only elements equal to 1 set a bit.

    Returns:
        A list with one integer per cluster, in ``self._state`` order.
    """
    # Keep the base-class call outside of ``assert``: assert statements are
    # removed under ``python -O`` and the call must still happen.
    base_result = Frontend.__call__(self, row)
    assert base_result is row

    # Positions are distinct, so summing the powers of two equals OR-ing.
    bits = sum(1 << pos for pos, value in enumerate(row) if value == 1)

    projected = []
    for cluster in self._state:
        mask = 0
        for dim in cluster:
            mask |= 1 << dim
        projected.append(bits & mask)
    return projected
def __exit__(self, exc_type, exc_value, traceback):
    """Close both databases and remove their on-disk storage.

    Each step is skipped when the corresponding attribute is still None.

    Args:
        exc_type: Forwarded to ``Frontend.__exit__``.
        exc_value: Forwarded to ``Frontend.__exit__``.
        traceback: Forwarded to ``Frontend.__exit__``.

    Returns:
        False, so an exception raised inside the ``with`` body propagates.

    Raises:
        AssertionError: If ``Frontend.__exit__`` does not return False, or
            if ``self._kdb.close()`` returns a false value.
    """
    # Keep the base-class call outside of ``assert``: assert statements are
    # removed under ``python -O`` and the base cleanup must still run.
    base_result = Frontend.__exit__(self, exc_type, exc_value, traceback)
    assert base_result == False

    if self._ldb is not None:
        # NOTE(review): the reason for this fixed delay is not visible
        # here; it is kept as-is.
        sleep(3.0)
        self._ldb.close()
    if self._ldbdn is not None:
        rmtree(self._ldbdn)

    if self._kdb is not None:
        try:
            # close() must not live inside ``assert``; under ``python -O``
            # the database would otherwise never be closed.
            if not self._kdb.close():
                raise AssertionError("self._kdb.close() reported failure")
        except Exception:
            print(str(self._kdb.error()))
            raise
    if self._kdbfn is not None:
        remove(self._kdbfn)

    return False
def __call__(self, row):
    """Collapse ``row`` into one averaged value per cluster.

    Members of a cluster are signed, 1-based dimension ids: id ``d`` refers
    to ``row[abs(d) - 1]``.  A positive id adds that element to the cluster
    total and a negative id subtracts it.  The total is divided by the
    number of members.

    Args:
        row: Indexable sequence of numbers.

    Returns:
        A list with one float per cluster, in ``self._state`` order.
    """
    # Keep the base-class call outside of ``assert``: assert statements are
    # removed under ``python -O`` and the call must still happen.
    base_result = Frontend.__call__(self, row)
    assert base_result is row

    averaged = []
    for cluster in self._state:
        total = 0.0
        for dim in cluster:
            # NOTE(review): ids are 1-based and signed (the clusters are
            # seeded as {1}, {2}, ... when they are built), so the id must
            # be mapped to a 0-based column.  Indexing ``row[dim]``
            # directly is off by one for positive ids and counts from the
            # end of the row for negative ones.  Confirm against the
            # cluster-building helpers.
            column = abs(dim) - 1
            if dim > 0:
                total += row[column]
            else:
                assert dim < 0
                total -= row[column]
        averaged.append(total / len(cluster))
    return averaged
def train(self, row):
    """Count co-occurrences of value pairs, keyed together with the label.

    ``row`` is unpacked as ``(y, c, b, x)``.  The lengths of ``c``, ``b``
    and ``x`` are recorded on the first call and must stay the same on
    every later call.  The three parts are concatenated into one flat row,
    which is handed to ``Frontend.train``.

    For every index pair ``(i, j)`` with ``i < j``, and additionally for
    every ``j`` when ``i`` is the last index, the counter stored under the
    packed key ``(i, j, y, row[i], row[j])`` is incremented by one.

    Args:
        row: 4-tuple ``(y, c, b, x)``.  ``c``, ``b`` and ``x`` must support
            ``+`` concatenation.  ``y`` and all elements must be packable
            as unsigned 32-bit integers.

    Returns:
        True if ``Frontend.train`` returned a true value (nothing is counted
        then), otherwise False.

    Raises:
        AssertionError: If a part length differs from the recorded one, or
            if ``self._kdb.increment`` returns a false value.
    """
    (y, c, b, x) = row

    if self._len_c is None:
        self._len_c = len(c)
    assert self._len_c == len(c)
    if self._len_b is None:
        self._len_b = len(b)
    assert self._len_b == len(b)
    if self._len_x is None:
        self._len_x = len(x)
    assert self._len_x == len(x)

    row = c + b + x
    if Frontend.train(self, row):
        return True

    keyfmt = '>IIIII'
    lenrow = self._lenrow
    last = lenrow - 1
    for i in range(lenrow):
        for j in range(lenrow):
            if i >= j and i != last:
                continue
            key = pack(keyfmt, i, j, y, row[i], row[j])
            try:
                # increment() must not live inside ``assert``; under
                # ``python -O`` nothing would be counted at all.
                if not self._kdb.increment(key, 1, 0):
                    raise AssertionError(
                        "self._kdb.increment() reported failure"
                    )
            except Exception:
                print(str(self._kdb.error()))
                raise

    # Explicit, to match the other ``train`` implementations in this file.
    return False
def _finalize(self):
    """Partition the dimensions ``0 .. _lenrow - 1`` into clusters.

    While at least two dimensions remain, ``self._split`` is asked to carve
    the remainder into ``(left, right, rest)``; ``left`` and ``right`` are
    appended to ``self._state`` and the loop continues with ``rest``.  A
    single leftover dimension becomes a cluster of its own.
    """
    # Keep the base-class call outside of ``assert``: assert statements are
    # removed under ``python -O`` and the call must still happen.
    base_result = Frontend._finalize(self)
    assert base_result is None

    self._state = []
    rest = set(range(self._lenrow))
    while rest:
        if len(rest) < 2:
            # Exactly one dimension is left; it forms the final cluster.
            self._state.append(rest)
            break
        left, right, rest = self._split(rest)
        self._state.append(left)
        self._state.append(right)
def _finalize(self):
    """Copy all counters into ``self._ldb`` and compute information content.

    Steps:
      1. Every record reachable through a cursor over ``self._kdb`` is
         written unchanged to ``self._ldb``.
      2. ``self._ic[i]`` is computed for every dimension ``i``.
      3. ``self._icbp[(i, j)]`` is computed for every pair ``i < j``.
      4. ``self._state`` is set to a dict holding both tables and the
         recorded part lengths, under the keys ``"ic"``, ``"icbp"``,
         ``"c"``, ``"b"`` and ``"x"``.
    """
    # Keep the base-class call outside of ``assert``: assert statements are
    # removed under ``python -O`` and the call must still happen.
    base_result = Frontend._finalize(self)
    assert base_result is None

    cursor = self._kdb.cursor()
    cursor.jump()
    while True:
        # get(True) is expected to return the current record and advance;
        # a false value ends the scan.
        record = cursor.get(True)
        if not record:
            break
        self._ldb.put(record[0], record[1])

    lenrow = self._lenrow

    # The tables are filled in place (not built and assigned afterwards) in
    # case the helper methods consult the partially filled attribute.
    self._ic = {}
    for i in range(lenrow):
        self._ic[i] = self._get_info_content_by_dimension(i)

    self._icbp = {}
    for i in range(lenrow):
        for j in range(i + 1, lenrow):
            self._icbp[(i, j)] = self._get_info_content_by_pair(i, j)

    self._state = {
        "ic": self._ic,
        "icbp": self._icbp,
        "c": self._len_c,
        "b": self._len_b,
        "x": self._len_x,
    }
def __init__(self, fn, mode):
    """Create the frontend with an empty pattern-count table.

    Args:
        fn: Passed through unchanged to ``Frontend.__init__``.
        mode: Passed through unchanged to ``Frontend.__init__``.
    """
    Frontend.__init__(self, fn, mode)
    self._max_rows = 100000
    # Starts empty; populated outside this method.
    self._stats = dict()
def __init__(self, fn, mode):
    """Create the frontend with an empty list of collected rows.

    Args:
        fn: Passed through unchanged to ``Frontend.__init__``.
        mode: Passed through unchanged to ``Frontend.__init__``.
    """
    Frontend.__init__(self, fn, mode)
    self._max_rows = 100000
    # Starts empty; populated outside this method.
    self._data = list()