Example No. 1
    def multiget(self, *args, **kwargs):
        """
        Like :meth:`get()`, but a list of keys may be specified.

        The result of multiget will be a dictionary where the keys
        are the keys from the `keys` argument, minus any missing rows.
        The value for each key in the dictionary will be the same as
        if :meth:`get()` were called on that individual key.

        """
        if 'columns' not in kwargs and not self.super and not self.raw_columns:
            kwargs['columns'] = self.fields

        kcmap = ColumnFamily.multiget(self, *args, **kwargs)
        ret = self.dict_class()
        for key, columns in kcmap.iteritems():
            if self.super:
                if 'super_column' not in kwargs:
                    vals = self.dict_class()
                    for super_column, subcols in columns.iteritems():
                        combined = self.combine_columns(subcols)
                        vals[super_column] = create_instance(self.cls, key=key, super_column=super_column, **combined)
                    ret[key] = vals
                else:
                    combined = self.combine_columns(columns)
                    ret[key] = create_instance(self.cls, key=key, super_column=kwargs['super_column'], **combined)
            else:
                combined = self.combine_columns(columns)
                ret[key] = create_instance(self.cls, key=key, **combined)
        return ret
import collections
import json

from pycassa.columnfamily import ColumnFamily

import util  # project-local helper providing getConnection()


def get(userId, startTime, endTime):
  #################### TEMP
  #userId = 'user-784b9158-5233-454e-8dcf-c229cdff12c6'
  print 'Getting result for userId: {0} between time {1} and {2}'.format(userId, startTime, endTime)
  con = util.getConnection()
  logCF = ColumnFamily(con, 'event_log_mux')

  rowKeys = ['{0}:{1}'.format(userId, i+1) for i in range(4)]
  rows = logCF.multiget(rowKeys)

  print 'Shows rows multiplexed into different rows, each individually sorted in reverse chronological order:'
  merge = {}
  for row in rows:
    print '>> ' + str(row)
    merge = dict(merge.items() + rows[row].items())
    for col in rows[row]:
      colstr = rows[row][col]
      coljson = json.loads(colstr)
      print '\tInsertion Timestamp: {0}'.format(coljson['insert_time'])

  final = collections.OrderedDict(sorted(merge.items(), reverse=True))
  for k, v in final.iteritems():
    coljson = json.loads(v)
    print 'Insertion Timestamp: {0}'.format(coljson['insert_time'])

  """
Example No. 4
 def get_data(self, cf_name, key, start_time, end_time, output_json=False):
     cf = ColumnFamily(self.pool, cf_name)
     try:
         result = cf.multiget(self.gen_key_range(key, start_time, end_time), column_start=start_time*1000, column_finish=end_time*1000, column_count=10000000)
         if output_json:
             self.dump_json(result)
     except NotFoundException:
         pass
    def file_metadata(self, keys):
        """Obtain metadata for a stored file.

        Argument is an iterable of file keys whose data to obtain.
        """
        cf = ColumnFamily(self.pool, 'files')

        return cf.multiget(keys)
Example No. 6
 def get_data(self, cf_name, key, start_time, end_time, output_json=False):
     cf = ColumnFamily(self.pool, cf_name)
     try:
         result = cf.multiget(self.gen_key_range(key, start_time, end_time),
                              column_start=start_time * 1000,
                              column_finish=end_time * 1000,
                              column_count=10000000)
         if output_json:
             self.dump_json(result)
     except NotFoundException:
         pass
Example No. 7
    def _get_oldest_thread(self):
        threads = ColumnFamily(self.conn, 'threads')

        # get all row keys
        row_keys = []
        ret = list(threads.get_range())
        for v in ret:
            row_keys.append(v[0])

        result = []
        ret = threads.multiget(row_keys)
        for key in row_keys:
            row = {}
            row['thread_id'] = int(key)
            row['thread_name'] = int(key)
            row['post_count'] = ret[key]['post_count']
            row['create_time'] = ret[key]['create_time']
            row['update_time'] = ret[key]['update_time']
            result.append(row)

        result.sort(cmp=lambda x,y: cmp(x['update_time'], y['update_time']))
        return result[0]
Example No. 8
    def get_all_posts_in_thread(self, thread_id):
        posts = ColumnFamily(self.conn, str(thread_id))

        # get all row keys
        row_keys = []
        ret = list(posts.get_range())
        for v in ret:
            row_keys.append(v[0])

        # get all row data
        result = []
        ret = posts.multiget(row_keys)
        for key in row_keys:
            row = {}
            row['key'] = int(key)
            row['name'] = ret[key]['name']
            row['content'] = ret[key]['content']
            row['post_time'] = ret[key]['post_time']
            result.append(row)

        result.sort(cmp=lambda x,y: cmp(x['key'], y['key']))
        return result
Example No. 9
class DailyTemporalBloomFilter(DailyTemporalBase):
    """Long Range Temporal BloomFilter using a daily resolution.

    For really high values of expiration (like 60 days) with low requirements on precision.
    The actual error of this BF will be the native error of the BF plus the error related
    to the coarse aspect of the expiration, since we no longer expire information precisely.
    Also, as opposed to a classic Bloom Filter, this one will also have false positives (reporting membership for a non-member)
    AND false negatives (reporting non-membership for a member).

    The upper bound of the temporal_error can theoretically be quite high. However, if the
    items of the set are uniformly distributed over time, the average error will be something like 1.0 / expiration
    """
    def __new__(cls,
                capacity,
                error_rate,
                expiration,
                name,
                cassandra_session,
                snapshot_path='./'):
        return super(DailyTemporalBloomFilter,
                     cls).__new__(cls,
                                  capacity=capacity,
                                  error_rate=error_rate)

    def __init__(self,
                 capacity,
                 error_rate,
                 expiration,
                 name,
                 cassandra_session,
                 snapshot_path='./'):
        filename = ""
        super(DailyTemporalBloomFilter, self).__init__(capacity=capacity,
                                                       error_rate=error_rate)
        self.bf_name = name
        self.expiration = expiration
        self.initialize_period()
        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()
        self.snapshot_path = snapshot_path

    def ensure_cassandra_cf(self):
        s = SystemManager()
        if self.keyspace not in s.list_keyspaces():
            s.create_keyspace(self.keyspace, SIMPLE_STRATEGY,
                              {'replication_factor': '1'})
        if self.cassandra_columns_family not in s.get_keyspace_column_families(
                self.keyspace):
            s.create_column_family(self.keyspace,
                                   self.cassandra_columns_family)
        self.columnfamily = ColumnFamily(self.cassandra_session,
                                         self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert(
                '%s_%s' % (self.bf_name, current_period_hour),
                {k: ''
                 for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end datetime
        (inclusive)."""
        def total_seconds(td):
            return (td.microseconds +
                    (td.seconds + td.days * 24.0 * 3600.0) * 10.0**6) / 10.0**6

        hours = int(math.ceil(total_seconds(end - start) / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def _day_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the days between a start and end datetime
        (inclusive)."""
        days = (end - start).days
        if inclusive:
            days += 1
        for i in xrange(days):
            if reverse:
                yield end - dt.timedelta(days=i)
            else:
                yield start + dt.timedelta(days=i)

    def _drop_archive(self):
        last_period = self.current_period - dt.timedelta(days=self.expiration -
                                                         1)
        hours = self._hour_range(last_period, dt.datetime.now())
        for hour in hours:
            try:
                row = "%s_%s" % (self.bf_name, hour.strftime('%Y-%m-%d:%H'))
                nbr_keys = self.columnfamily.get_count(row)
                keys = self.columnfamily.remove(row)
            except:
                pass

    def rebuild_from_archive(self, rebuild_snapshot=True):
        """Rebuild the BF using the archived items"""
        self.initialize_bitarray()

        #if rebuild_snapshot:
        #    self.delete_snapshots()

        def multi_rows_itr(rows):
            for row in rows.values():
                for k in row.keys():
                    yield k

        last_period = self.current_period - dt.timedelta(days=self.expiration -
                                                         1)
        hours = self._hour_range(last_period, dt.datetime.now())
        days = self._day_range(last_period, dt.datetime.now())
        rows = []
        for i, day in enumerate(days):
            rows = [
                "%s_%s:%s" % (self.bf_name, day.strftime('%Y-%m-%d'), hour_str)
                for hour_str in ["%02d" % i for i in range(24)]
            ]
            rows_content = self.columnfamily.multiget(rows, column_count=1E6)
            update_current = day == self.current_period

            for k in multi_rows_itr(rows_content):
                self.add_rebuild(k, update_current)

            if rebuild_snapshot:
                self.save_snaphot(override_period=day)

            if not update_current:
                self.initialize_current_day_bitarray()

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshot on the disk (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.bf_name,
                                            self.expiration)
        availables_snapshots = glob.glob(base_filename)
        last_period = self.current_period - dt.timedelta(days=self.expiration -
                                                         1)
        for filename in availables_snapshots:
            snapshot_period = dt.datetime.strptime(
                filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
            if snapshot_period < last_period and not clean_old_snapshot:
                continue
            else:
                self._union_bf_from_file(filename)
                if snapshot_period == self.current_period:
                    self._union_bf_from_file(filename, current=True)

            if snapshot_period < last_period and clean_old_snapshot:
                os.remove(filename)
        self.ready = True

    def add_rebuild(self, key, update_current=True):
        super(DailyTemporalBloomFilter, self).add(key, update_current)

    def add(self, key_string):
        if isinstance(key_string, unicode):
            key = key_string.encode('utf8')
        else:
            key = key_string

        self.archive_bf_key(key)
        result = super(DailyTemporalBloomFilter, self).add(key)

        return result

    def resize(self, new_capacity=None, new_error_rate=None):
        self._set_capacity(new_capacity or self.capacity)
        self._set_error_rate(new_error_rate or self.error_rate)
        self._initialize_parameters()
        self.initialize_bitarray()
        self.rebuild_from_archive(rebuild_snapshot=True)

    def initialize_period(self, period=None):
        """Initialize the period of BF.

        :period: datetime.datetime for setting the period explicitly.
        """
        if not period:
            self.current_period = dt.datetime.now()
        else:
            self.current_period = period
        self.current_period = dt.datetime(self.current_period.year,
                                          self.current_period.month,
                                          self.current_period.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def save_snaphot(self, override_period=None):
        """Save the current state of the current day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using this format:
            filename : name_expiration_2013-01-01.dat
        """
        period = override_period or self.current_period
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.bf_name,
                                        self.expiration,
                                        period.strftime("%Y-%m-%d"))
        self._save_snapshot(filename)
Example No. 10
    print col_fam.get_count('author')
    col_fam.insert('row_key', {'col_name': 'col_val'})
    col_fam.insert('row_key', {'col_name':'col_val', 'col_name2':'col_val2'})
    col_fam.batch_insert({'row1': {'name1': 'val1', 'name2': 'val2'},'row2': {'foo': 'bar'}})
    #col_fam.insert('super_key', {'key':{'col_name':'col_val', 'col_name2':'col_val2'}})
    print col_fam.get_count('row_key', columns=['foo', 'bar'])
    print col_fam.get_count('row_key', column_start='foo') 
    print col_fam.multiget_count(['fib0', 'fib1', 'fib2', 'fib3', 'fib4'])
    print col_fam.multiget_count(['fib0', 'fib1', 'fib2', 'fib3', 'fib4'],columns=['col1', 'col2', 'col3'])
    print col_fam.multiget_count(['fib0', 'fib1', 'fib2', 'fib3', 'fib4'],column_start='col1', column_finish='col3')
    print col_fam.get_count('row_key')
    print col_fam.get('row_key')
    print col_fam.get('author')
    print col_fam.get('row_key', columns=['col_name', 'col_name2'])
    print col_fam.get('row_key', column_reversed=True, column_count=3)
    print col_fam.multiget(['row1', 'row2'])
    for i in range(1, 10):
        col_fam.insert('row_key', {str(i): 'val'})
    print col_fam.get('row_key', column_start='5', column_finish='7')
    result = col_fam.get_range(start='row_key5', finish='row_key7') 
    for key, columns in result:
        print key, '=>', columns
    #Super column
#    col_fam = pycassa.ColumnFamily(pool, 'Super1')
#    col_fam.insert('row_key', {'supercol_name': {'col_name': 'col_val'}})
    print col_fam.get('row_key')
#    col_fam = pycassa.ColumnFamily(pool, 'Letters')
#    col_fam.insert('row_key', {'super': {'a': '1', 'b': '2', 'c': '3'}})
#    print col_fam.get('row_key', super_column='super')
#    print col_fam.get('row_key', super_column='super', columns=['a', 'b'])
#    print col_fam.get('row_key', super_column='super', column_start='b')
Example No. 11
class CassandraDemo(object):
    def __init__(self, database, table):
        self.database = database
        self.table = table

    def create_connections(self):
        self.pool = ConnectionPool(self.database)
        self.cf = ColumnFamily(self.pool, self.table)

    def create_database_and_table(self):
        super_cf = False # consider super columns to be deprecated
        s = SystemManager()

        # create keyspace if it doesn't exist
        if self.database not in s.list_keyspaces():
            s.create_keyspace(self.database, SIMPLE_STRATEGY, {'replication_factor': '1'})

        # delete the column family from the keyspace if it already exists.
        if self.table in s.get_keyspace_column_families(self.database):
            s.drop_column_family(self.database, self.table)

        # create the column family in the keyspace
        if self.table not in s.get_keyspace_column_families(self.database):
            print("creating table...")
            s.create_column_family(self.database, self.table, super = super_cf, comparator_type = ASCII_TYPE)
        s.close()

        return True

    def insert_data(self):
        print '\ninserting employee data...'
        self.cf.insert('1', {'fn':'yogesh', 'ln':'kumar', 'ct': 'Ajmer', 'em': '*****@*****.**'})
        self.cf.insert('2', {'fn':'amit', 'ln':'pandita', 'ct': 'Delhi', 'em': '*****@*****.**'})
        self.cf.insert('3', {'fn':'sandeep', 'ln':'tak', 'ct': 'Ajmer', 'em': '*****@*****.**', 'mb': '8890467032'})


    def get_data(self):
        print '\nfetching employee data...'
        data1 = self.cf.get('1')
        data2 = self.cf.get('2', columns = ['fn', 'ln', 'em'])
        data3 = self.cf.get('3', column_start = 'ct', column_finish = 'fn')
        data4 = self.cf.get('1', column_reversed = False, column_count = 3)
        data5 = self.cf.get('1', column_reversed = True, column_count = 3)
        print data1
        print data2
        print data3
        print data4
        print data5

    def get_multiple_data(self):
        print '\ngetting multiple employees data...'
        row_keys = ['1','2','3']
        data = self.cf.multiget(row_keys)
        print data

    def get_data_by_range(self):
        '''
        if you get an error don't worry about this, it's a Cassandra limitation Issue
        '''
        print '\ngetting employees data by range...'
        start_row_key = '1'
        end_row_key = '3'
        data = self.cf.get_range(start = start_row_key, finish = end_row_key)
        for key, columns in data:
            print key, columns

    def get_count(self):
        print '\nget employee row\'s column count'
        print self.cf.get_count('1')
        print self.cf.get_count('1', columns = ['fn', 'ln'])
        print self.cf.get_count('1', column_start = 'em')

    def get_multi_count(self):
        print '\nget multiple employees row\'s column count'
        row_keys = ['1','2','3']
        columns = ['fn', 'ln', 'mb']
        column_start = 'ct'
        column_finish = 'fn'
        print self.cf.multiget_count(row_keys)
        print self.cf.multiget_count(row_keys, columns = columns)
        print self.cf.multiget_count(row_keys, column_start = column_start, column_finish = column_finish)

    def update_data(self):
        print '\nupdating employee data...'
        self.cf.insert('1', {'pwd':'yoku@2010', 'ct':'Noida'})


    def delete_data(self):
        print '\ndelete data from employee'
        row = '2'
        self.cf.remove(row)

    def get_all_rows(self):
        print '\ngetting rows name...'
        print [v[0] for v in self.cf.get_range()]

    def get_all_columns_of_row(self):
        print '\ngetting columns name of a row'
        row = '1'
        data = self.cf.get(row)
        print data.keys()
Example No. 12
#!/usr/bin/python
#-*- coding:utf-8 -*-
import csv
import glob
from pycassa.system_manager import *
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool('employees')
filenames = glob.glob('employees/*.csv')
for filename in filenames:
    only_name = filename.split('/')[-1].split('.')[0]
    print only_name
    cf = ColumnFamily(pool, only_name)
    print cf.multiget(['row1', 'row2'])
    break
Example No. 13
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool("pykeyspace", ["localhost:9160"])
col_family = ColumnFamily(pool, "UserInfo")
col_family.insert("dosht2", {"email": "*****@*****.**", "name": "mostafa"})
# print col_family.get("dosht2", columns=["email"])['email']
print col_family.get("dosht2")
b = col_family.batch()
b.insert("dodo", {"email": "*****@*****.**"})
b.remove("dosht2", ["name"])
b.send()
print col_family.get("dosht2")
print col_family.multiget(["dosht", "dodo"])["dodo"]

from pycassa.types import *


class User(object):
    key = UTF8Type()  # name key is mandatory
    email = AsciiType()
    age = IntegerType()

    def __repr__(self):
        return "User(key: %s, email: %s, age: %s)" % (self.key, self.email, self.age)


from pycassa.columnfamilymap import ColumnFamilyMap

cfmap = ColumnFamilyMap(User, pool, "UserInfo")
user = User()
Example No. 14
 def multiget(self, *args, **kwargs):
     col_fam = ColumnFamily(self.pool, self.__column_family__)
     return col_fam.multiget(*args, **kwargs)
Example No. 15
#!/usr/bin/python
#-*- coding:utf-8 -*-
import csv
import glob
from pycassa.system_manager import *
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool('employees')
filenames = glob.glob('employees/*.csv')
for filename in filenames:
    only_name = filename.split('/')[-1].split('.')[0]
    print only_name
    cf = ColumnFamily(pool, only_name)
    print cf.multiget(['row1','row2'])
    break
Example No. 16
################################### GET #######################################
# Get the row for the rowkey
authors = author_cf.get('sacharya')
print authors

# Get value for column
print "Get value for column"
authors = author_cf.get('sacharya1', columns=['first_name'])
print authors

# Get the columns for the row key and column keys
authors = author_cf.get('sacharya1', columns=['first_name', 'last_name'])
print authors

authors = author_cf.multiget(['sacharya', 'sacharya1'])
print authors

print "Printing the keys"
keys = authors.keys()
for k in keys:
    print authors.get(k)
print "Keys printed"

#authors = list(author.get_range().get_keys())
for value in author_cf.get_range():
    print value[0]

# Only if using OrderPreservingPartitioner. Default is RandomPartitioner, which
# does md5 on the key
#for value in author_cf.get_range(start='sacharya5', finish='sacharya10'):
Example No. 17
class DailyTemporalBloomFilter(object):
    """Long Range Temporal BloomFilter using a daily resolution.

    For really high values of expiration (like 60 days) with low requirements on precision.
    The actual error of this BF will be the native error of the BF plus the error related
    to the coarse aspect of the expiration, since we no longer expire information precisely.
    Also, as opposed to a classic Bloom Filter, this one will also have false positives (reporting membership for a non-member)
    AND false negatives (reporting non-membership for a member).

    The upper bound of the temporal_error can theoretically be quite high. However, if the
    items of the set are uniformly distributed over time, the average error will be something like 1.0 / expiration
    """
    def __init__(self, capacity, error_rate, expiration, name, snapshot_path,
                 cassandra_session):
        self.error_rate = error_rate
        self.capacity = capacity
        self._initialize_parameters()
        self.initialize_bitarray()
        self.count = 0
        self.hashed_values = []
        self.name = name
        self.snapshot_path = snapshot_path
        self.expiration = expiration
        self.initialize_period()
        self.snapshot_to_load = None
        self.ready = False
        self.warm_period = None
        self.next_snapshot_load = time.time()
        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()

    def _initialize_parameters(self):
        self.nbr_slices = int(np.ceil(np.log2(1.0 / self.error_rate)))
        self.bits_per_slice = int(
            np.ceil((self.capacity * abs(np.log(self.error_rate))) /
                    (self.nbr_slices * (np.log(2)**2))))
        self.nbr_bits = self.nbr_slices * self.bits_per_slice
        self.hashes = generate_hashfunctions(self.bits_per_slice,
                                             self.nbr_slices)

    def ensure_cassandra_cf(self):
        s = SystemManager()
        if self.keyspace not in s.list_keyspaces():
            s.create_keyspace(self.keyspace, SIMPLE_STRATEGY,
                              {'replication_factor': '1'})
        if self.cassandra_columns_family not in s.get_keyspace_column_families(
                self.keyspace):
            s.create_column_family(self.keyspace,
                                   self.cassandra_columns_family)
        self.columnfamily = ColumnFamily(self.cassandra_session,
                                         self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert(
                '%s_%s' % (self.name, current_period_hour),
                {k: ''
                 for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end datetime
        (inclusive)."""
        def total_seconds(td):
            return (td.microseconds +
                    (td.seconds + td.days * 24.0 * 3600.0) * 10.0**6) / 10.0**6

        hours = int(math.ceil(total_seconds(end - start) / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def resize(self, new_capacity):
        self.capacity = new_capacity
        self._initialize_parameters()
        self.rebuild_from_archive()

    def _drop_archive(self):
        last_period = self.current_period - dt.timedelta(days=self.expiration -
                                                         1)
        hours = self._hour_range(last_period, dt.datetime.now())
        for hour in hours:
            try:
                row = "%s_%s" % (self.name, hour.strftime('%Y-%m-%d:%H'))
                nbr_keys = self.columnfamily.get_count(row)
                keys = self.columnfamily.remove(row)
            except:
                pass

    def rebuild_from_archive(self):
        """Rebuild the BF using the archived items"""
        self.initialize_bitarray()
        last_period = self.current_period - dt.timedelta(days=self.expiration -
                                                         1)
        hours = self._hour_range(last_period, dt.datetime.now())
        rows = []
        for i, hour in enumerate(hours):
            row = "%s_%s" % (self.name, hour.strftime('%Y-%m-%d:%H'))
            rows.append(row)
        rows_content = self.columnfamily.multiget(rows, column_count=1E6)

        for row_content in rows_content.values():
            for k in row_content.keys():
                self.add(k, rebuild_mode=True)

    def initialize_bitarray(self):
        """Initialize both bitarray.

        This BF contain two bit arrays instead of single one like a plain BF. bitarray
        is the main bit array where all the historical items are stored. It's the one
        used for the membership query. The second one, current_day_bitarray is the one
        used for creating the daily snapshot.
        """
        self.bitarray = bitarray.bitarray(self.nbr_bits)
        self.current_day_bitarray = bitarray.bitarray(self.nbr_bits)
        self.bitarray.setall(False)
        self.current_day_bitarray.setall(False)

    def __contains__(self, key):
        """Check membership."""
        self.hashed_values = self.hashes(key)
        offset = 0
        for value in self.hashed_values:
            if not self.bitarray[offset + value]:
                return False
            offset += self.bits_per_slice
        return True

    def add(self, key, rebuild_mode=False):
        if not rebuild_mode:
            self.archive_bf_key(key)
        if key in self:
            return True
        offset = 0
        if not self.hashed_values:
            self.hashed_values = self.hashes(key)
        for value in self.hashed_values:
            self.bitarray[offset + value] = True
            self.current_day_bitarray[offset + value] = True
            offset += self.bits_per_slice
        self.count += 1
        return False

    def initialize_period(self, period=None):
        """Initialize the period of BF.

        :period: datetime.datetime for setting the period explicitly.
        """
        if not period:
            self.current_period = dt.datetime.now()
        else:
            self.current_period = period
        self.current_period = dt.datetime(self.current_period.year,
                                          self.current_period.month,
                                          self.current_period.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def maintenance(self):
        """Expire the old element of the set.

        Initialize a new bitarray and load the previous snapshot. Execute this guy
        at the beginining of each day.
        """
        self.initialize_period()
        self.initialize_bitarray()
        self.restore_from_disk()

    def compute_refresh_period(self):
        self.warm_period = (60 * 60 * 24) // (self.expiration - 2)

    def _should_warm(self):
        return time.time() >= self.next_snapshot_load

    def warm(self, jittering_ratio=0.2):
        """Progressively load the previous snapshot during the day.

        Loading all the snapshots at once can take a substantial amount of time. This method, if called
        periodically during the day, will progressively load those snapshots one by one. Because many workers are
        going to use this method at the same time, we add jitter to the period between loads to avoid
        hammering the disk at the same time.
        """
        if self.snapshot_to_load == None:
            last_period = self.current_period - dt.timedelta(
                days=self.expiration - 1)
            self.compute_refresh_period()
            self.snapshot_to_load = []
            base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name,
                                                self.expiration)
            availables_snapshots = glob.glob(base_filename)
            for filename in availables_snapshots:
                snapshot_period = dt.datetime.strptime(
                    filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
                if snapshot_period >= last_period:
                    self.snapshot_to_load.append(filename)
                    self.ready = False

        if self.snapshot_to_load and self._should_warm():
            filename = self.snapshot_to_load.pop()
            self._union_bf_from_file(filename)
            jittering = self.warm_period * (np.random.random() -
                                            0.5) * jittering_ratio
            self.next_snapshot_load = time.time(
            ) + self.warm_period + jittering
            if not self.snapshot_to_load:
                self.ready = True

    def _union_bf_from_file(self, filename, current=False):
        snapshot = cPickle.loads(zlib.decompress(open(filename, 'r').read()))
        if current:
            self.current_day_bitarray = self.current_day_bitarray | snapshot
        else:
            self.bitarray = self.bitarray | snapshot

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshot on the disk (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name,
                                            self.expiration)
        availables_snapshots = glob.glob(base_filename)
        last_period = self.current_period - dt.timedelta(days=self.expiration -
                                                         1)
        for filename in availables_snapshots:
            snapshot_period = dt.datetime.strptime(
                filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
            if snapshot_period < last_period and not clean_old_snapshot:
                continue
            else:
                self._union_bf_from_file(filename)
                if snapshot_period == self.current_period:
                    self._union_bf_from_file(filename, current=True)

            if snapshot_period < last_period and clean_old_snapshot:
                os.remove(filename)
        self.ready = True

    def save_snaphot(self):
        """Save the current state of the current day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using this format:
            filename : name_expiration_2013-01-01.dat
        """
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.name,
                                        self.expiration, self.date)
        with open(filename, 'w') as f:
            f.write(
                zlib.compress(
                    cPickle.dumps(self.current_day_bitarray,
                                  protocol=cPickle.HIGHEST_PROTOCOL)))

    def union_current_day(self, bf):
        """Union only the current_day of an other BF."""
        self.bitarray = self.bitarray | bf.current_day_bitarray
Example No. 18
class DailyTemporalBloomFilter(object):
    """Long Range Temporal BloomFilter using a daily resolution.

    For really high values of expiration (like 60 days) with low requirements on precision.
    The actual error of this BF will be the native error of the BF plus the error related
    to the coarse aspect of the expiration, since we no longer expire information precisely.
    Also, as opposed to a classic Bloom Filter, this one will also have false positives (reporting membership for a non-member)
    AND false negatives (reporting non-membership for a member).

    The upper bound of the temporal_error can theoretically be quite high. However, if the
    items of the set are uniformly distributed over time, the average error will be something like 1.0 / expiration
    """

    def __init__(self, capacity, error_rate, expiration, name, snapshot_path, cassandra_session):
        self.error_rate = error_rate
        self.capacity = capacity
        self._initialize_parameters()
        self.initialize_bitarray()
        self.count = 0
        self.hashed_values = []
        self.name = name
        self.snapshot_path = snapshot_path
        self.expiration = expiration
        self.initialize_period()
        self.snapshot_to_load = None
        self.ready = False
        self.warm_period = None
        self.next_snapshot_load = time.time()
        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()

    def _initialize_parameters(self):
        self.nbr_slices = int(np.ceil(np.log2(1.0 / self.error_rate)))
        self.bits_per_slice = int(np.ceil((self.capacity * abs(np.log(self.error_rate))) / (self.nbr_slices * (np.log(2) ** 2))))
        self.nbr_bits = self.nbr_slices * self.bits_per_slice
        self.hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)

    def ensure_cassandra_cf(self):
        s = SystemManager()
        if self.keyspace not in s.list_keyspaces():
            s.create_keyspace(self.keyspace, SIMPLE_STRATEGY, {'replication_factor': '1'})
        if self.cassandra_columns_family not in s.get_keyspace_column_families(self.keyspace):
            s.create_column_family(self.keyspace, self.cassandra_columns_family)
        self.columnfamily = ColumnFamily(self.cassandra_session, self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert('%s_%s' % (self.name, current_period_hour), {k:'' for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end datetime
        (inclusive)."""

        def total_seconds(td):
            return (td.microseconds + (td.seconds + td.days * 24.0 * 3600.0) * 10.0**6) / 10.0**6

        hours = int(math.ceil(total_seconds(end - start) / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def resize(self, new_capacity):
        self.capacity = new_capacity
        self._initialize_parameters()
        self.rebuild_from_archive()

    def _drop_archive(self):
        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        hours = self._hour_range(last_period, dt.datetime.now())
        for hour in hours:
            try:
                row = "%s_%s" % (self.name, hour.strftime('%Y-%m-%d:%H'))
                nbr_keys = self.columnfamily.get_count(row)
                keys = self.columnfamily.remove(row)
            except:
                pass

    def rebuild_from_archive(self):
        """Rebuild the BF using the archived items"""
        self.initialize_bitarray()
        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        hours = self._hour_range(last_period, dt.datetime.now())
        rows = []
        for i,hour in enumerate(hours):
            row = "%s_%s" % (self.name, hour.strftime('%Y-%m-%d:%H'))
            rows.append(row)
        rows_content = self.columnfamily.multiget(rows, column_count=1E6)

        for row_content in rows_content.values():
            for k in row_content.keys():
                self.add(k, rebuild_mode=True)

    def initialize_bitarray(self):
        """Initialize both bitarray.

        This BF contain two bit arrays instead of single one like a plain BF. bitarray
        is the main bit array where all the historical items are stored. It's the one
        used for the membership query. The second one, current_day_bitarray is the one
        used for creating the daily snapshot.
        """
        self.bitarray = bitarray.bitarray(self.nbr_bits)
        self.current_day_bitarray = bitarray.bitarray(self.nbr_bits)
        self.bitarray.setall(False)
        self.current_day_bitarray.setall(False)

    def __contains__(self, key):
        """Check membership."""
        self.hashed_values = self.hashes(key)
        offset = 0
        for value in self.hashed_values:
            if not self.bitarray[offset + value]:
                return False
            offset += self.bits_per_slice
        return True

    def add(self, key, rebuild_mode=False):
        if not rebuild_mode:
            self.archive_bf_key(key)
        if key in self:
            return True
        offset = 0
        if not self.hashed_values:
            self.hashed_values = self.hashes(key)
        for value in self.hashed_values:
            self.bitarray[offset + value] = True
            self.current_day_bitarray[offset + value] = True
            offset += self.bits_per_slice
        self.count += 1
        return False

    def initialize_period(self, period=None):
        """Initialize the period of BF.

        :period: datetime.datetime for setting the period explicitly.
        """
        if not period:
            self.current_period = dt.datetime.now()
        else:
            self.current_period = period
        self.current_period = dt.datetime(self.current_period.year, self.current_period.month, self.current_period.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def maintenance(self):
        """Expire the old element of the set.

        Initialize a new bitarray and load the previous snapshot. Execute this guy
        at the beginining of each day.
        """
        self.initialize_period()
        self.initialize_bitarray()
        self.restore_from_disk()

    def compute_refresh_period(self):
        self.warm_period =  (60 * 60 * 24) // (self.expiration-2)

    def _should_warm(self):
        return time.time() >= self.next_snapshot_load

    def warm(self, jittering_ratio=0.2):
        """Progressively load the previous snapshot during the day.

        Loading all the snapshots at once can take a substantial amount of time. This method, if called
        periodically during the day, will progressively load those snapshots one by one. Because many workers are
        going to use this method at the same time, we add jitter to the period between loads to avoid
        hammering the disk at the same time.
        """
        if self.snapshot_to_load == None:
            last_period = self.current_period - dt.timedelta(days=self.expiration-1)
            self.compute_refresh_period()
            self.snapshot_to_load = []
            base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name, self.expiration)
            availables_snapshots = glob.glob(base_filename)
            for filename in availables_snapshots:
                snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
                if snapshot_period >= last_period:
                    self.snapshot_to_load.append(filename)
                    self.ready = False

        if self.snapshot_to_load and self._should_warm():
            filename = self.snapshot_to_load.pop()
            self._union_bf_from_file(filename)
            jittering = self.warm_period * (np.random.random()-0.5) * jittering_ratio
            self.next_snapshot_load = time.time() + self.warm_period + jittering
            if not self.snapshot_to_load:
                self.ready = True


    def _union_bf_from_file(self, filename, current=False):
        snapshot = cPickle.loads(zlib.decompress(open(filename,'r').read()))
        if current:
            self.current_day_bitarray = self.current_day_bitarray | snapshot
        else:
            self.bitarray = self.bitarray | snapshot

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshot on the disk (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name, self.expiration)
        availables_snapshots = glob.glob(base_filename)
        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        for filename in availables_snapshots:
            snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
            if snapshot_period <  last_period and not clean_old_snapshot:
                continue
            else:
                self._union_bf_from_file(filename)
                if snapshot_period == self.current_period:
                    self._union_bf_from_file(filename, current=True)

            if snapshot_period < last_period and clean_old_snapshot:
                os.remove(filename)
        self.ready = True

    def save_snaphot(self):
        """Save the current state of the current day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using this format:
            filename : name_expiration_2013-01-01.dat
        """
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.name, self.expiration, self.date)
        with open(filename, 'w') as f:
            f.write(zlib.compress(cPickle.dumps(self.current_day_bitarray, protocol=cPickle.HIGHEST_PROTOCOL)))

    def union_current_day(self, bf):
        """Union only the current_day of an other BF."""
        self.bitarray = self.bitarray | bf.current_day_bitarray
Example No. 19
class DailyTemporalBloomFilter(DailyTemporalBase):
    """Long Range Temporal BloomFilter using a daily resolution.

    For really high values of expiration (like 60 days) with low requirements on precision.
    The actual error of this BF will be the native error of the BF plus the error related
    to the coarse aspect of the expiration, since we no longer expire information precisely.
    Also, as opposed to a classic Bloom Filter, this one will also have false positives (reporting membership for a non-member)
    AND false negatives (reporting non-membership for a member).

    The upper bound of the temporal_error can theoretically be quite high. However, if the
    items of the set are uniformly distributed over time, the average error will be something like 1.0 / expiration
    """

    def __new__(cls, capacity, error_rate, expiration, name, cassandra_session, snapshot_path='./'):
        return super(DailyTemporalBloomFilter, cls).__new__(cls, capacity=capacity, error_rate=error_rate)

    def __init__(self, capacity, error_rate, expiration, name, cassandra_session, snapshot_path='./'):
        filename = ""
        super(DailyTemporalBloomFilter, self).__init__(capacity=capacity, error_rate=error_rate)
        self.bf_name = name
        self.expiration = expiration
        self.initialize_period()
        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()
        self.snapshot_path = snapshot_path

    def ensure_cassandra_cf(self):
        s = SystemManager()
        if self.keyspace not in s.list_keyspaces():
            s.create_keyspace(self.keyspace, SIMPLE_STRATEGY, {'replication_factor': '1'})
        if self.cassandra_columns_family not in s.get_keyspace_column_families(self.keyspace):
            s.create_column_family(self.keyspace, self.cassandra_columns_family)
        self.columnfamily = ColumnFamily(self.cassandra_session, self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert('%s_%s' % (self.bf_name, current_period_hour), {k:'' for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end datetime
        (inclusive)."""

        def total_seconds(td):
            return (td.microseconds + (td.seconds + td.days * 24.0 * 3600.0) * 10.0**6) / 10.0**6

        hours = int(math.ceil(total_seconds(end - start) / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def _day_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the days between a start and end datetime
        (inclusive)."""
        days = (end - start).days
        if inclusive:
            days += 1
        for i in xrange(days):
            if reverse:
                yield end - dt.timedelta(days=i)
            else:
                yield start + dt.timedelta(days=i)

    def _drop_archive(self):
        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        hours = self._hour_range(last_period, dt.datetime.now())
        for hour in hours:
            try:
                row = "%s_%s" % (self.bf_name, hour.strftime('%Y-%m-%d:%H'))
                nbr_keys = self.columnfamily.get_count(row)
                keys = self.columnfamily.remove(row)
            except:
                pass

    def rebuild_from_archive(self, rebuild_snapshot=True):
        """Rebuild the BF using the archived items"""
        self.initialize_bitarray()

        #if rebuild_snapshot:
        #    self.delete_snapshots()

        def multi_rows_itr(rows):
            for row in rows.values():
                for k in row.keys():
                    yield k

        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        hours = self._hour_range(last_period, dt.datetime.now())
        days = self._day_range(last_period, dt.datetime.now())
        rows = []
        for i,day in enumerate(days):
            rows = ["%s_%s:%s" % (self.bf_name, day.strftime('%Y-%m-%d'), hour_str) for hour_str in ["%02d" % i for i in range(24)]]
            rows_content = self.columnfamily.multiget(rows, column_count=1E6)
            update_current = day == self.current_period

            for k in multi_rows_itr(rows_content):
                self.add_rebuild(k, update_current)

            if rebuild_snapshot:
                self.save_snaphot(override_period=day)

            if not update_current:
                self.initialize_current_day_bitarray()

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshot on the disk (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.bf_name, self.expiration)
        availables_snapshots = glob.glob(base_filename)
        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        for filename in availables_snapshots:
            snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
            if snapshot_period <  last_period and not clean_old_snapshot:
                continue
            else:
                self._union_bf_from_file(filename)
                if snapshot_period == self.current_period:
                    self._union_bf_from_file(filename, current=True)

            if snapshot_period < last_period and clean_old_snapshot:
                os.remove(filename)
        self.ready = True

    def add_rebuild(self, key, update_current=True):
        super(DailyTemporalBloomFilter, self).add(key, update_current)

    def add(self, key_string):
        if isinstance(key_string, unicode):
            key = key_string.encode('utf8')
        else:
            key = key_string

        self.archive_bf_key(key)
        result = super(DailyTemporalBloomFilter, self).add(key)

        return result

    def resize(self, new_capacity=None, new_error_rate=None):
        self._set_capacity(new_capacity or self.capacity)
        self._set_error_rate(new_error_rate or self.error_rate)
        self._initialize_parameters()
        self.initialize_bitarray()
        self.rebuild_from_archive(rebuild_snapshot=True)

    def initialize_period(self, period=None):
        """Initialize the period of BF.

        :period: datetime.datetime for setting the period explicitly.
        """
        if not period:
            self.current_period = dt.datetime.now()
        else:
            self.current_period = period
        self.current_period = dt.datetime(self.current_period.year, self.current_period.month, self.current_period.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def save_snaphot(self, override_period=None):
        """Save the current state of the current day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using this format:
            filename : name_expiration_2013-01-01.dat
        """
        period = override_period or self.current_period
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.bf_name, self.expiration, period.strftime("%Y-%m-%d"))
        self._save_snapshot(filename)
Example No. 20
    from pycassa.types import *
    col_fam.column_validators['IntColumn5'] = IntegerType()
    col_fam.column_validators['IntColumn6'] = IntegerType()    
    col_fam.insert('intData', {'IntColumn5':5, 'IntColumn6':6})
    print col_fam.get('intData')
    # OrderedDict([('IntColumn5', 5), ('IntColumn6', 6)])
    
#Batch operations

    col_fam.batch_insert({'key4': {'Column1': 'PycassaData4', 
                                   'Column2': 'PycassaData5',
                                   'Column3': 'PycassaData6',
                                   'Column4': 'PycassaData7',
                                   'Column5': 'PycassaData8'},
                          'key5': {'Column7': 'PycassaData9'}})
    readData = col_fam.multiget(['key3', 'key4', 'key5'])
    readData = col_fam.multiget(['key3', 'key4', 'key5'], columns=['Column1', 'Column7'])

#Column Slices

    readData = col_fam.get('key4', column_start='Column2', column_finish='Column4')
    readData = col_fam.get('key4', column_reversed=True, column_count=3)    

#Types

#     from pycassa.types import *
#     class User(object):
#          key = AsciiType()
#          name = UTF8Type()
#          age = IntegerType()
#          height = FloatType()