class ClientCassandra(object):

    def __init__(self, keySpace):
        self.pool = ConnectionPool(keySpace, ['localhost:9160'])
        self.col_fam_page = ColumnFamily(self.pool, 'Page')
        self.col_fam_publication = ColumnFamily(self.pool, 'Publication')
        self.col_fam_company = ColumnFamily(self.pool, 'Company')
        self.col_fam_location = ColumnFamily(self.pool, 'Location')
        self.col_fam_category = ColumnFamily(self.pool, 'Category')
        # One-time schema setup, run from pycassaShell:
        # SYSTEM_MANAGER.create_keyspace('BlwData', strategy_options={"replication_factor": "1"})
        # SYSTEM_MANAGER.create_column_family('BlwData', 'Page')
        # SYSTEM_MANAGER.create_column_family('BlwData', 'Publication')
        # SYSTEM_MANAGER.create_column_family('BlwData', 'Company')
        # SYSTEM_MANAGER.create_column_family('BlwData', 'Location')
        # SYSTEM_MANAGER.create_column_family('BlwData', 'Category')

    def insertPage(self, page):
        timestamp = self.col_fam_page.insert(page.getUrl(), page.toJson())
        print "sizeof category " + page.category.name + " is " + str(self.col_fam_category.get_count(page.category.name))
        self.col_fam_category.insert(page.category.name, {'url': page.getUrl()})
        print "sizeof category " + page.category.name + " is " + str(self.col_fam_category.get_count(page.category.name))
        return timestamp

    def getPages(self, url, column):
        readData = self.col_fam_page.get(url, columns=[column])
        return readData

    def getCountCategory(self, category):
        return self.col_fam_category.get_count(category)
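A minimal usage sketch for ClientCassandra. Page and Category here are hypothetical stand-ins (the class only assumes getUrl(), toJson(), and category.name), and the BlwData keyspace and column families are assumed to exist, e.g. via the pycassaShell commands in the constructor's comments:

# Hypothetical usage sketch; Page/Category are invented stand-ins, not part of the code above.
class Category(object):
    def __init__(self, name):
        self.name = name

class Page(object):
    def __init__(self, url, body, category):
        self.url, self.body, self.category = url, body, category
    def getUrl(self):
        return self.url
    def toJson(self):
        # pycassa expects a dict of column name -> string value here
        return {'body': self.body, 'category': self.category.name}

client = ClientCassandra('BlwData')
client.insertPage(Page('http://example.com/a', '...', Category('news')))
print client.getPages('http://example.com/a', 'body')
print client.getCountCategory('news')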
def _check_cassandra(self, del_network_keyname, local_hostname, cassandra_listen_port):
    from pycassa.pool import ConnectionPool
    from pycassa.columnfamily import ColumnFamily
    pool1 = ConnectionPool('config_db_uuid', [local_hostname + ":" + cassandra_listen_port])
    col_fam = ColumnFamily(pool1, 'obj_fq_name_table')
    return col_fam.get_count('virtual_network', columns=[del_network_keyname])
def GetValueCount(pool, columnFamily, key, *args, **kwargs):
    d = None
    try:
        col_fam = ColumnFamily(pool, columnFamily)
        d = col_fam.get_count(key, *args, **kwargs)
    except Exception:
        # Read failed (e.g., unknown column family); leave d as None.
        pass
    return d
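A hedged usage sketch; 'Keyspace1' and 'ColumnFamily1' are placeholders, and the extra keyword arguments are forwarded straight to pycassa's get_count:

from pycassa.pool import ConnectionPool

pool = ConnectionPool('Keyspace1', ['localhost:9160'])  # placeholder keyspace/server
print GetValueCount(pool, 'ColumnFamily1', 'row_key')   # count of all columns in the row
print GetValueCount(pool, 'ColumnFamily1', 'row_key',
                    column_start='a', column_finish='m')  # count within a column slice
# Returns None if the read fails (e.g., the column family does not exist).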
class DailyTemporalBloomFilter(DailyTemporalBase):
    """Long-range temporal Bloom filter using a daily resolution.

    For really high values of expiration (like 60 days) with low requirements
    on precision. The actual error of this BF will be the native error of the
    BF plus the error related to the coarse aspect of the expiration, since we
    no longer expire information precisely. Also, as opposed to a classic
    Bloom filter, this one will also have false positives (reporting
    membership for a non-member) AND false negatives (reporting non-membership
    for a member).

    The upper bound of the temporal_error can theoretically be quite high.
    However, if the items of the set are uniformly distributed over time, the
    average error will be approximately 1.0 / expiration.
    """

    def __new__(cls, capacity, error_rate, expiration, name, cassandra_session, snapshot_path='./'):
        return super(DailyTemporalBloomFilter, cls).__new__(cls, capacity=capacity, error_rate=error_rate)

    def __init__(self, capacity, error_rate, expiration, name, cassandra_session, snapshot_path='./'):
        super(DailyTemporalBloomFilter, self).__init__(capacity=capacity, error_rate=error_rate)
        self.bf_name = name
        self.expiration = expiration
        self.initialize_period()
        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()
        self.snapshot_path = snapshot_path

    def ensure_cassandra_cf(self):
        s = SystemManager()
        if self.keyspace not in s.list_keyspaces():
            s.create_keyspace(self.keyspace, SIMPLE_STRATEGY, {'replication_factor': '1'})
        if self.cassandra_columns_family not in s.get_keyspace_column_families(self.keyspace):
            s.create_column_family(self.keyspace, self.cassandra_columns_family)
        self.columnfamily = ColumnFamily(self.cassandra_session, self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert('%s_%s' % (self.bf_name, current_period_hour),
                                     {k: '' for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end datetime (inclusive)."""
        def total_seconds(td):
            return (td.microseconds + (td.seconds + td.days * 24.0 * 3600.0) * 10.0**6) / 10.0**6
        hours = int(math.ceil(total_seconds(end - start) / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def _day_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the days between a start and end datetime (inclusive)."""
        days = (end - start).days
        if inclusive:
            days += 1
        for i in xrange(days):
            if reverse:
                yield end - dt.timedelta(days=i)
            else:
                yield start + dt.timedelta(days=i)

    def _drop_archive(self):
        last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
        hours = self._hour_range(last_period, dt.datetime.now())
        for hour in hours:
            try:
                row = "%s_%s" % (self.bf_name, hour.strftime('%Y-%m-%d:%H'))
                nbr_keys = self.columnfamily.get_count(row)
                keys = self.columnfamily.remove(row)
            except Exception:
                pass

    def rebuild_from_archive(self, rebuild_snapshot=True):
        """Rebuild the BF using the archived items."""
        self.initialize_bitarray()
        # if rebuild_snapshot:
        #     self.delete_snapshots()

        def multi_rows_itr(rows):
            for row in rows.values():
                for k in row.keys():
                    yield k

        last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
        days = self._day_range(last_period, dt.datetime.now())
        for day in days:
            rows = ["%s_%s:%s" % (self.bf_name, day.strftime('%Y-%m-%d'), hour_str)
                    for hour_str in ["%02d" % i for i in range(24)]]
            rows_content = self.columnfamily.multiget(rows, column_count=1E6)
            update_current = day == self.current_period
            for k in multi_rows_itr(rows_content):
                self.add_rebuild(k, update_current)
            if rebuild_snapshot:
                self.save_snaphot(override_period=day)
            if not update_current:
                self.initialize_current_day_bitarray()

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshots on disk (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.bf_name, self.expiration)
        availables_snapshots = glob.glob(base_filename)
        last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
        for filename in availables_snapshots:
            snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
            if snapshot_period < last_period and not clean_old_snapshot:
                continue
            else:
                self._union_bf_from_file(filename)
                if snapshot_period == self.current_period:
                    self._union_bf_from_file(filename, current=True)
            if snapshot_period < last_period and clean_old_snapshot:
                os.remove(filename)
        self.ready = True

    def add_rebuild(self, key, update_current=True):
        super(DailyTemporalBloomFilter, self).add(key, update_current)

    def add(self, key_string):
        if isinstance(key_string, unicode):
            key = key_string.encode('utf8')
        else:
            key = key_string
        self.archive_bf_key(key)
        result = super(DailyTemporalBloomFilter, self).add(key)
        return result

    def resize(self, new_capacity=None, new_error_rate=None):
        self._set_capacity(new_capacity or self.capacity)
        self._set_error_rate(new_error_rate or self.error_rate)
        self._initialize_parameters()
        self.initialize_bitarray()
        self.rebuild_from_archive(rebuild_snapshot=True)

    def initialize_period(self, period=None):
        """Initialize the period of the BF.

        :period: datetime.datetime for setting the period explicitly.
        """
        if not period:
            self.current_period = dt.datetime.now()
        else:
            self.current_period = period
        self.current_period = dt.datetime(self.current_period.year, self.current_period.month, self.current_period.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def save_snaphot(self, override_period=None):
        """Save the current state of the current-day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using
        this format: filename : name_expiration_2013-01-01.dat
        """
        period = override_period or self.current_period
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.bf_name, self.expiration, period.strftime("%Y-%m-%d"))
        self._save_snapshot(filename)
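A minimal driver sketch for the class above. It assumes DailyTemporalBase and the module imports are available, that the 'parsely' keyspace already exists (ConnectionPool cannot connect to a missing keyspace, even though ensure_cassandra_cf would create the column family), and that a pycassa ConnectionPool is an acceptable cassandra_session, since the constructor passes it straight to ColumnFamily:

from pycassa.pool import ConnectionPool

pool = ConnectionPool('parsely', ['localhost:9160'])  # keyspace assumed to exist
bf = DailyTemporalBloomFilter(capacity=100000, error_rate=0.01, expiration=60,
                              name='events', cassandra_session=pool, snapshot_path='/tmp')
bf.add('some-key')              # also archives the key to Cassandra in batches of commit_batch
print 'some-key' in bf          # membership query
bf.save_snaphot()               # persist today's bitarray to disk
bf.resize(new_capacity=200000)  # re-derives parameters, then replays the Cassandra archive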
def get_count(columnFamily, uid):
    """Get the number of columns in a row."""
    column = ColumnFamily(pool, columnFamily)  # `pool` is a module-level ConnectionPool
    count = column.get_count(uid)
    print uid, count
    return count
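The helper above closes over a module-level pool; a sketch of that surrounding setup (keyspace and column family names are placeholders):

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool('Keyspace1', ['localhost:9160'])  # module-level pool used by get_count

n = get_count('Users', 'user-42')  # prints the key and count, then returns the count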
from pycassa.pool import ConnectionPool
from pycassa.columnfamilymap import ColumnFamilyMap
from pycassa.columnfamily import ColumnFamily

if __name__ == '__main__':
    # ['10.15.62.100:9160', '10.15.62.101:9160', '10.15.62.102:9160']
    pool = ConnectionPool('Cassandra_Test', ['10.107.4.187:9160'])
    print pool

    # cf_map = ColumnFamilyMap(User, pool, 'Users')
    col_fam = ColumnFamily(pool, 'Users')
    print col_fam.get('author')
    print col_fam.get_count('author')

    col_fam.insert('row_key', {'col_name': 'col_val'})
    col_fam.insert('row_key', {'col_name': 'col_val', 'col_name2': 'col_val2'})
    col_fam.batch_insert({'row1': {'name1': 'val1', 'name2': 'val2'}, 'row2': {'foo': 'bar'}})
    # col_fam.insert('super_key', {'key': {'col_name': 'col_val', 'col_name2': 'col_val2'}})

    print col_fam.get_count('row_key', columns=['foo', 'bar'])
    print col_fam.get_count('row_key', column_start='foo')
    print col_fam.multiget_count(['fib0', 'fib1', 'fib2', 'fib3', 'fib4'])
    print col_fam.multiget_count(['fib0', 'fib1', 'fib2', 'fib3', 'fib4'], columns=['col1', 'col2', 'col3'])
    print col_fam.multiget_count(['fib0', 'fib1', 'fib2', 'fib3', 'fib4'], column_start='col1', column_finish='col3')
    print col_fam.get_count('row_key')

    print col_fam.get('row_key')
    print col_fam.get('author')
    print col_fam.get('row_key', columns=['col_name', 'col_name2'])
    print col_fam.get('row_key', column_reversed=True, column_count=3)
    print col_fam.multiget(['row1', 'row2'])

    for i in range(1, 10):
        col_fam.insert('row_key', {str(i): 'val'})
    print col_fam.get('row_key', column_start='5', column_finish='7')
class CassandraDemo(object):

    def __init__(self, database, table):
        self.database = database
        self.table = table

    def create_connections(self):
        self.pool = ConnectionPool(self.database)
        self.cf = ColumnFamily(self.pool, self.table)

    def create_database_and_table(self):
        super_cf = False  # consider super columns to be deprecated
        s = SystemManager()
        # create the keyspace if it doesn't exist
        if self.database not in s.list_keyspaces():
            s.create_keyspace(self.database, SIMPLE_STRATEGY, {'replication_factor': '1'})
        # drop the column family from the keyspace if it already exists
        if self.table in s.get_keyspace_column_families(self.database):
            s.drop_column_family(self.database, self.table)
        # create the column family in the keyspace
        if self.table not in s.get_keyspace_column_families(self.database):
            print("table is creating...")
            s.create_column_family(self.database, self.table, super=super_cf, comparator_type=ASCII_TYPE)
        s.close()
        return True

    def insert_data(self):
        print '\nemployee data is inserting...'
        self.cf.insert('1', {'fn': 'yogesh', 'ln': 'kumar', 'ct': 'Ajmer', 'em': '*****@*****.**'})
        self.cf.insert('2', {'fn': 'amit', 'ln': 'pandita', 'ct': 'Delhi', 'em': '*****@*****.**'})
        self.cf.insert('3', {'fn': 'sandeep', 'ln': 'tak', 'ct': 'Ajmer', 'em': '*****@*****.**', 'mb': '8890467032'})

    def get_data(self):
        print '\nemployee data is fetching...'
        data1 = self.cf.get('1')
        data2 = self.cf.get('2', columns=['fn', 'ln', 'em'])
        data3 = self.cf.get('3', column_start='ct', column_finish='fn')
        data4 = self.cf.get('1', column_reversed=False, column_count=3)
        data5 = self.cf.get('1', column_reversed=True, column_count=3)
        print data1
        print data2
        print data3
        print data4
        print data5

    def get_multiple_data(self):
        print '\ngetting multiple employees data...'
        row_keys = ['1', '2', '3']
        data = self.cf.multiget(row_keys)
        print data

    def get_data_by_range(self):
        """If you get an error here, don't worry: row-range queries like this
        are a known Cassandra limitation (they depend on the partitioner)."""
        print '\ngetting employees data by range...'
        start_row_key = '1'
        end_row_key = '3'
        data = self.cf.get_range(start=start_row_key, finish=end_row_key)
        for key, columns in data:
            print key, columns

    def get_count(self):
        print '\nget employee row\'s column count'
        print self.cf.get_count('1')
        print self.cf.get_count('1', columns=['fn', 'ln'])
        print self.cf.get_count('1', column_start='em')

    def get_multi_count(self):
        print '\nget multiple employees row\'s column count'
        row_keys = ['1', '2', '3']
        columns = ['fn', 'ln', 'mb']
        column_start = 'ct'
        column_finish = 'fn'
        print self.cf.multiget_count(row_keys)
        print self.cf.multiget_count(row_keys, columns=columns)
        print self.cf.multiget_count(row_keys, column_start=column_start, column_finish=column_finish)

    def update_data(self):
        print '\nemployee data is updating...'
        self.cf.insert('1', {'pwd': 'yoku@2010', 'ct': 'Noida'})

    def delete_data(self):
        print '\ndelete data from employee'
        row = '2'
        self.cf.remove(row)

    def get_all_rows(self):
        print '\ngetting rows name...'
        print [v[0] for v in self.cf.get_range()]

    def get_all_columns_of_row(self):
        print '\ngetting columns name of a row'
        row = '1'
        data = self.cf.get(row)
        print data.keys()
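A possible driver for CassandraDemo; 'employee_db' and 'employee' are placeholder names. Note that create_database_and_table must run before create_connections, because ConnectionPool can only connect to an existing keyspace:

demo = CassandraDemo('employee_db', 'employee')
demo.create_database_and_table()  # creates keyspace and (re)creates the column family
demo.create_connections()
demo.insert_data()
demo.get_data()
demo.get_multiple_data()
demo.get_count()
demo.get_multi_count()
demo.update_data()
demo.delete_data()
demo.get_all_rows()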
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

# connecting to Cassandra
pool = ConnectionPool('Keyspace1')

# getting a ColumnFamily
col_fam = ColumnFamily(pool, 'ColumnFamily1')

# inserting data
col_fam.insert('row_key', {'col_name': 'col_val', 'col_name2': 'col_val2'})

# getting data
col_fam.get('row_key')  # {'col_name': 'col_val', 'col_name2': 'col_val2'}

# counting columns in the row
col_fam.get_count('row_key')
#!/usr/bin/env python
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool('Keyspace1', server_list=['localhost:9160'])
col_fam = ColumnFamily(pool, 'ColumnFamily1')

check1 = col_fam.insert('row_key', {'col_name': 'col_val'})
print check1
print " is the timestamp of the inserted column\n"

check2 = col_fam.insert('row_key', {'col_name': 'col_val', 'col_name2': 'col_val2'})
print check2
print " multiple columns are added\n"

get_data1 = col_fam.get('row_key')
print get_data1
get_data2 = col_fam.get('row_key', columns=['col_name', 'col_name2'])
print get_data2
print "\n"

print "Slicing\n"
for i in range(1, 10):
    col_fam.insert('row_key', {str(i): 'val'})
print col_fam.get('row_key', column_start='5', column_finish='7')
print "\n"

print "Counting columns in the row: "
print col_fam.get_count('row_key')
    # (Fragment: tail of a record-comparison function; e1, e2, fl, c, and the
    # `epson` tolerance come from the surrounding code that is not shown.)
    dinvCost = float(e1['invCost']) - float(e2['invCost'])
    if abs(dinvCost) > epson:
        fl.write("+invCost:" + str(dinvCost) + '.' + e1['invCost'] + ',' + e2['invCost'] + ':' + str(e1) + ',' + str(e2) + '\n')
        return
    dbillingInvoice = float(e1['billingInvoice']) - float(e2['billingInvoice'])
    if abs(dbillingInvoice) > epson:
        fl.write("+billingInvoice:" + str(dbillingInvoice) + ',' + e1['billingInvoice'] + ',' + e2['billingInvoice'] + ':' + str(e1) + ',' + str(e2) + '\n')
        return
    c.write(str(e1) + '\n')

servers = ['pb036:9160', 'pb037:9160', 'pb038:9160']
pool = ConnectionPool('RSS', server_list=servers, timeout=1, pool_size=20)
meta = ColumnFamily(pool, 'MetaData')
counter = ColumnFamily(pool, 'Counter')

oid_count = meta.get_count('rss.All')
print 'Total oids => ', oid_count

oids_gen = meta.xget('rss.All', column_reversed=True, include_timestamp=True)
oids = dict(oids_gen)
# oids = meta.get('rss.All', column_reversed=True)
# print oids

home = os.path.expanduser("~")
fi = open(home + "/rss/diff_cas.txt", "w")
fj = open(home + "/rss/diff_rss.txt", "w")
difi = open(home + "/rss/difi.txt", "w")
difl = open(home + "/rss/difl.txt", "w")
f = open(home + "/rss/good.txt", "w")
class DailyTemporalBloomFilter(object):
    """Long-range temporal Bloom filter using a daily resolution.

    For really high values of expiration (like 60 days) with low requirements
    on precision. The actual error of this BF will be the native error of the
    BF plus the error related to the coarse aspect of the expiration, since we
    no longer expire information precisely. Also, as opposed to a classic
    Bloom filter, this one will also have false positives (reporting
    membership for a non-member) AND false negatives (reporting non-membership
    for a member).

    The upper bound of the temporal_error can theoretically be quite high.
    However, if the items of the set are uniformly distributed over time, the
    average error will be approximately 1.0 / expiration.
    """

    def __init__(self, capacity, error_rate, expiration, name, snapshot_path, cassandra_session):
        self.error_rate = error_rate
        self.capacity = capacity
        self._initialize_parameters()
        self.initialize_bitarray()
        self.count = 0
        self.hashed_values = []
        self.name = name
        self.snapshot_path = snapshot_path
        self.expiration = expiration
        self.initialize_period()
        self.snapshot_to_load = None
        self.ready = False
        self.warm_period = None
        self.next_snapshot_load = time.time()
        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()

    def _initialize_parameters(self):
        self.nbr_slices = int(np.ceil(np.log2(1.0 / self.error_rate)))
        self.bits_per_slice = int(np.ceil((self.capacity * abs(np.log(self.error_rate))) / (self.nbr_slices * (np.log(2) ** 2))))
        self.nbr_bits = self.nbr_slices * self.bits_per_slice
        self.hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)

    def ensure_cassandra_cf(self):
        s = SystemManager()
        if self.keyspace not in s.list_keyspaces():
            s.create_keyspace(self.keyspace, SIMPLE_STRATEGY, {'replication_factor': '1'})
        if self.cassandra_columns_family not in s.get_keyspace_column_families(self.keyspace):
            s.create_column_family(self.keyspace, self.cassandra_columns_family)
        self.columnfamily = ColumnFamily(self.cassandra_session, self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert('%s_%s' % (self.name, current_period_hour),
                                     {k: '' for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end datetime (inclusive)."""
        def total_seconds(td):
            return (td.microseconds + (td.seconds + td.days * 24.0 * 3600.0) * 10.0**6) / 10.0**6
        hours = int(math.ceil(total_seconds(end - start) / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def resize(self, new_capacity):
        self.capacity = new_capacity
        self._initialize_parameters()
        self.rebuild_from_archive()

    def _drop_archive(self):
        last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
        hours = self._hour_range(last_period, dt.datetime.now())
        for hour in hours:
            try:
                row = "%s_%s" % (self.name, hour.strftime('%Y-%m-%d:%H'))
                nbr_keys = self.columnfamily.get_count(row)
                keys = self.columnfamily.remove(row)
            except Exception:
                pass

    def rebuild_from_archive(self):
        """Rebuild the BF using the archived items."""
        self.initialize_bitarray()
        last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
        hours = self._hour_range(last_period, dt.datetime.now())
        rows = []
        for i, hour in enumerate(hours):
            row = "%s_%s" % (self.name, hour.strftime('%Y-%m-%d:%H'))
            rows.append(row)
        rows_content = self.columnfamily.multiget(rows, column_count=1E6)
        for row_content in rows_content.values():
            for k in row_content.keys():
                self.add(k, rebuild_mode=True)

    def initialize_bitarray(self):
        """Initialize both bitarrays.

        This BF contains two bit arrays instead of a single one like a plain
        BF. bitarray is the main bit array where all the historical items are
        stored. It's the one used for the membership query. The second one,
        current_day_bitarray, is the one used for creating the daily snapshot.
        """
        self.bitarray = bitarray.bitarray(self.nbr_bits)
        self.current_day_bitarray = bitarray.bitarray(self.nbr_bits)
        self.bitarray.setall(False)
        self.current_day_bitarray.setall(False)

    def __contains__(self, key):
        """Check membership."""
        self.hashed_values = self.hashes(key)
        offset = 0
        for value in self.hashed_values:
            if not self.bitarray[offset + value]:
                return False
            offset += self.bits_per_slice
        return True

    def add(self, key, rebuild_mode=False):
        if not rebuild_mode:
            self.archive_bf_key(key)
        if key in self:
            return True
        offset = 0
        if not self.hashed_values:
            self.hashed_values = self.hashes(key)
        for value in self.hashed_values:
            self.bitarray[offset + value] = True
            self.current_day_bitarray[offset + value] = True
            offset += self.bits_per_slice
        self.count += 1
        return False

    def initialize_period(self, period=None):
        """Initialize the period of the BF.

        :period: datetime.datetime for setting the period explicitly.
        """
        if not period:
            self.current_period = dt.datetime.now()
        else:
            self.current_period = period
        self.current_period = dt.datetime(self.current_period.year, self.current_period.month, self.current_period.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def maintenance(self):
        """Expire the old elements of the set.

        Initialize a new bitarray and load the previous snapshots. Execute
        this at the beginning of each day.
        """
        self.initialize_period()
        self.initialize_bitarray()
        self.restore_from_disk()

    def compute_refresh_period(self):
        self.warm_period = (60 * 60 * 24) // (self.expiration - 2)

    def _should_warm(self):
        return time.time() >= self.next_snapshot_load

    def warm(self, jittering_ratio=0.2):
        """Progressively load the previous snapshots during the day.

        Loading all the snapshots at once can take a substantial amount of
        time. This method, if called periodically during the day, will
        progressively load those snapshots one by one. Because many workers
        are going to use this method at the same time, we add jittering to
        the period between loads to avoid hammering the disk at the same time.
        """
        if self.snapshot_to_load is None:
            last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
            self.compute_refresh_period()
            self.snapshot_to_load = []
            base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name, self.expiration)
            availables_snapshots = glob.glob(base_filename)
            for filename in availables_snapshots:
                snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
                if snapshot_period >= last_period:
                    self.snapshot_to_load.append(filename)
                    self.ready = False
        if self.snapshot_to_load and self._should_warm():
            filename = self.snapshot_to_load.pop()
            self._union_bf_from_file(filename)
            jittering = self.warm_period * (np.random.random() - 0.5) * jittering_ratio
            self.next_snapshot_load = time.time() + self.warm_period + jittering
            if not self.snapshot_to_load:
                self.ready = True

    def _union_bf_from_file(self, filename, current=False):
        snapshot = cPickle.loads(zlib.decompress(open(filename, 'r').read()))
        if current:
            self.current_day_bitarray = self.current_day_bitarray | snapshot
        else:
            self.bitarray = self.bitarray | snapshot

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshots on disk (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name, self.expiration)
        availables_snapshots = glob.glob(base_filename)
        last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
        for filename in availables_snapshots:
            snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
            if snapshot_period < last_period and not clean_old_snapshot:
                continue
            else:
                self._union_bf_from_file(filename)
                if snapshot_period == self.current_period:
                    self._union_bf_from_file(filename, current=True)
            if snapshot_period < last_period and clean_old_snapshot:
                os.remove(filename)
        self.ready = True

    def save_snaphot(self):
        """Save the current state of the current-day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using
        this format: filename : name_expiration_2013-01-01.dat
        """
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.name, self.expiration, self.date)
        with open(filename, 'w') as f:
            f.write(zlib.compress(cPickle.dumps(self.current_day_bitarray, protocol=cPickle.HIGHEST_PROTOCOL)))

    def union_current_day(self, bf):
        """Union only the current_day of another BF."""
        self.bitarray = self.bitarray | bf.current_day_bitarray
    def get_count(self, *args, **kwargs):
        col_fam = ColumnFamily(self.pool, self.__column_family__)
        return col_fam.get_count(*args, **kwargs)
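This delegating method only makes sense on a class that carries a pool and a __column_family__ attribute; a hypothetical host class to make the pattern concrete (UserModel and 'Users' are invented names):

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

class UserModel(object):
    __column_family__ = 'Users'  # hypothetical model declaration

    def __init__(self, pool):
        self.pool = pool

    def get_count(self, *args, **kwargs):
        col_fam = ColumnFamily(self.pool, self.__column_family__)
        return col_fam.get_count(*args, **kwargs)

model = UserModel(ConnectionPool('Keyspace1', ['localhost:9160']))
print model.get_count('row_key', column_start='a')  # kwargs forwarded to pycassa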
name_cf = ColumnFamily(pool, 'myname')

x = ['acharya1', 'acharya2']
name_cf.insert('sacharya3', {'last_name': x})  # note: column values normally must be strings
names3 = name_cf.get('sacharya3')
print "List as a value"
print names3

attrs = dict([(attr_name, set([attr_values])) for attr_name, attr_values in names3.iteritems()])
# set has no append() (and a mutating call would return None anyway);
# add the new value first, then insert the updated set.
attrs['last_name'].add("acharya3")
name_cf.insert("sacharya3", {'last_name': attrs['last_name']})
print name_cf.get('sacharya3')

################################# COUNT #######################################
# Count the number of columns for the row key
count = author_cf.get_count("sacharya1")
print count

count = author_cf.multiget_count(["sacharya1", "sacharya2"])
print count

################################## REMOVE #####################################
# Remove the column for the row key and column key
print "Removing the column last_name for row key sacharya1"
author_cf.remove('sacharya1', columns=['last_name'])
time.sleep(5)
authors = author_cf.get('sacharya1')
print authors

# Remove the entire row
author_cf.remove('sacharya1')