Example No. 1
def read(metric, start_time, end_time, tags):
    pool = ConnectionPool(keyspace, [address])
#   decide which column family to read based on the time difference
    diff = timeDiff(start_time, end_time)
    if diff <= 3600:
        col_fam = pycassa.ColumnFamily(pool, 'rawdata')
    elif diff <= 7200:
        col_fam = pycassa.ColumnFamily(pool, 'rollups60')
    elif diff <= 86400:
        col_fam = pycassa.ColumnFamily(pool, 'rollups300')
    elif diff <= 2592000:
        col_fam = pycassa.ColumnFamily(pool, 'rollups7200')
    else:
        col_fam = pycassa.ColumnFamily(pool, 'rollups86400')
        
#   convert start_time, end_time to upper timestamps (row buckets)
    start_upertime = start_time / upertime_interval
    end_upertime = end_time / upertime_interval
    points = {}
    for i in range(start_upertime, end_upertime + 1):
        key = generate_key(metric, i, tags)
        try:
            # accumulate across buckets rather than overwriting on each pass
            points.update(col_fam.get(key, column_start=start_time, column_finish=end_time))
        except pycassa.NotFoundException:
            # an empty bucket just means no data points in that interval
            continue
    pool.dispose()
    return points
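
# The module-level helpers used by read() are not shown on this page.
# A minimal sketch of what they might look like; every name and the
# tag-encoding scheme below are assumptions, not the original code:
keyspace = 'Monitor'
address = 'localhost:9160'
upertime_interval = 2592000  # seconds spanned by one row bucket

def timeDiff(start_time, end_time):
    return end_time - start_time

def generate_key(metric, upertime, tags):
    # e.g. 'cpu.load#42#dc=eu,host=web1'
    tag_part = ','.join('%s=%s' % (k, tags[k]) for k in sorted(tags))
    return '#'.join([metric, str(upertime), tag_part])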
Example No. 2
def get_cassandra_connection(keyspace_name, hosts):
    key = keyspace_name, tuple(hosts)
    connection_pool, created_at = connection_pool_cache.get(key, (None, None))

    init_new_pool = connection_pool is None or connection_pool_expired(
        created_at)

    if connection_pool is not None and len(connection_pool.server_list) == 0:
        logger.error('connection pool had no active hosts')
        init_new_pool = True

    if init_new_pool:
        nodes = detect_nodes(hosts, keyspace_name)
        logger.info('setting up a new connection pool')
        connection_pool = ConnectionPool(
            keyspace_name,
            nodes,
            pool_size=settings.FEEDLY_CASSANDRA_CONNECTION_POOL_SIZE,
            prefill=False,
            timeout=settings.FEEDLY_CASSANDRA_TIMEOUT,
            max_retries=3
        )
        listener = FeedlyPoolListener(connection_pool)
        connection_pool.add_listener(listener)
        connection_pool_cache[key] = (connection_pool, time.time())
    return connection_pool
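
# connection_pool_expired() is referenced above but not shown on this page.
# A plausible sketch, assuming pools are simply recycled after a fixed age
# (the 300-second cutoff and the helper body are assumptions):
import time

def connection_pool_expired(created_at):
    return created_at is None or time.time() - created_at > 300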
def main(filename):
    data = file(filename)
    # Set up the connection pool
    pool = ConnectionPool('tuitterdb',['localhost:9160'])
    
    # CF connections
    user_family = ColumnFamily(pool, 'user')
    tweet_family = ColumnFamily(pool, 'tweet')
    user_tweets_family = ColumnFamily(pool, 'userTweets')
    followers = ColumnFamily(pool, 'followers')
    followerTweets = ColumnFamily(pool, 'followsTweets')

    # Batch Definitions
    user_batch = user_family.batch(queue_size=1000)
    followers_batch = followers.batch(queue_size=500)
    user_tweets_batch = user_tweets_family.batch(queue_size = 500)
    followerTweets_batch = followerTweets.batch(queue_size = 500)
    
                           
    while True: # loop
        line = data.readline()
        if line == "": break # This isn't handled properly
        else:
            tweet = tweet_get(line)
            try:
                tweet_data = get_tweet_data(tweet)
                sender = get_sender(tweet)  # hoisted: sender is used below even when the user already exists
                if check_user(tweet[u"from_user_id_str"]) == False: # check in script if user is there.
                    user_batch.insert(sender.user_id,{'user_name':sender.user_name, 'screen_name':sender.from_user}) # create user entry for sender
                    user_tweets_batch.insert(sender.user_id,{line:''}) # insert the whole tweet into a userTweets column header

                if tweet[u"to_user"] is not None and check_user(tweet[u"to_user_id"]) == False:
                    to_user = get_to_user(tweet)
                    user_batch.insert(to_user.user_id,{'user_name':to_user.user_name, 'screen_name':to_user.from_user})
                    followers_batch.insert(to_user.user_id,{sender.user_id:'follower_id'}) 
                    followerTweets_batch.insert(to_user.user_id,{line:''}) # insert the whole tweet into a followeTweets column header for the to user.


                if u"entities" in tweet: # iterate over the users in mentions and add them to users and follows if necessary
                    if tweet[u"entities"][u"user_mentions"] != []:
                        user_mentions = get_mentions(tweet)
                        for obj in user_mentions:
                            if check_user(obj.user_id) == False:
                                user_batch.insert(obj.user_id,{'user_name':obj.user_name, 'screen_name':obj.from_user})
                            followers_batch.insert(obj.user_id,{'user_id':sender.user_id})
                            followerTweets_batch.insert(obj.user_id,{line:''}) # insert the whole tweet to a followerTweet entry for the mentioned user
                else:
                    continue

                tweet_family.insert(tweet_data.tweet_id,{'text':tweet_data.textbody,'user_id':sender.user_id,'timeanddate':tweet_data.timestamp})

            except Exception:
                err = sys.exc_info()
                # print the exception with its traceback and continue;
                # format_tb() is used because print_tb() returns None
                print "Broken cos %s %s %s" % (err[0], err[1], ''.join(traceback.format_tb(err[2])))
                continue

    # Pools Closed.
    pool.dispose()
def main(filename):
    data = file(filename)
    # Set up the connection pool
    pool = ConnectionPool('tuitterdb',['localhost:9160'])
    # CF connections
    user_family = ColumnFamily(pool, 'user')
    tweet_family = ColumnFamily(pool, 'tweet')
    user_tweets_family = ColumnFamily(pool, 'userTweets')
    #follows_tweets_family = ColumnFamily(pool, 'follows.tweets')
    followers = ColumnFamily(pool, 'followers')

    # Batch Definitions
    user_batch = user_family.batch(queue_size=1000)
    followers_batch = followers.batch(queue_size=500)
    user_tweets_batch = user_tweets_family.batch(queue_size = 500)
    
                           
    while True:
        line = data.readline()
        if line == "": break  # readline() returns '' at EOF, never None
        else:
            tweet = tweet_get(line)
            try:
                tweet_data = get_tweet_data(tweet)
                sender = get_sender(tweet)  # hoisted: sender is used below even when the user already exists
                if check_user(tweet[u"from_user_id_str"]) == False:
                    user_batch.insert(sender.user_id,{'user_name':sender.user_name, 'screen_name':sender.from_user})
                    user_tweets_batch.insert(sender.user_id,{tweet_data.tweet_id:tweet_data.timestamp})

                if tweet[u"to_user"] is not None and check_user(tweet[u"to_user_id"]) == False:
                    to_user = get_to_user(tweet)
                    user_batch.insert(to_user.user_id,{'user_name':to_user.user_name, 'screen_name':to_user.from_user})
                    followers_batch.insert(to_user.user_id,{'user_id':sender.user_id})


                if u"entities" in tweet:
                    if tweet[u"entities"][u"user_mentions"] != []:
                        user_mentions = get_mentions(tweet)
                        for obj in user_mentions:
                            user_batch.insert(obj.user_id,{'user_name':obj.user_name, 'screen_name':obj.from_user})
                            followers_batch.insert(obj.user_id,{'user_id':sender.user_id})
                else:
                    continue

                tweet_family.insert(tweet_data.tweet_id,{'text':tweet_data.textbody,'user_id':sender.user_id,'timeanddate':tweet_data.timestamp})

            except Exception:
                err = sys.exc_info()
                print "Broken cos %s %s %s" % (err[0], err[1], ''.join(traceback.format_tb(err[2])))  # format_tb, since print_tb returns None
                continue

    # Pools Closed.
    pool.dispose()

#if __name__ == "__main__":
    #unittest.main()
Example No. 5
def test_big_batched_writes():
    ## this is an m1.xlarge doing nothing but supporting this test
    server = 'localhost:9160'
    keyspace = 'testkeyspace_' + getpass.getuser().replace('-', '_')
    family = 'testcf'
    sm = SystemManager(server)
    try:
        sm.drop_keyspace(keyspace)
    except pycassa.InvalidRequestException:
        pass
    sm.create_keyspace(keyspace, SIMPLE_STRATEGY, {'replication_factor': '1'})
    sm.create_column_family(keyspace, family, super=False,
                            key_validation_class = LEXICAL_UUID_TYPE,
                            default_validation_class  = LEXICAL_UUID_TYPE,
                            column_name_class = ASCII_TYPE)
    sm.alter_column(keyspace, family, 'test', ASCII_TYPE)
    sm.close()

    pool = ConnectionPool(keyspace, [server], max_retries=10, pool_timeout=0, pool_size=10, timeout=120)
    pool.fill()
    pool.add_listener( Listener() )

    ## assert that we are using framed transport
    conn = pool._q.get()
    assert isinstance(conn.transport, thrift.transport.TTransport.TFramedTransport)
    pool._q.put(conn)

    try:
        for num_rows in range(14, 20):
            ## write an increasing number of one-megabyte rows to cassandra
            one_mb = ' ' * 2**20
            rows = []
            for i in xrange(num_rows):
                key = uuid.uuid4()
                rows.append((key, dict(test=one_mb)))

            testcf = pycassa.ColumnFamily(pool, family)
            with testcf.batch() as batch:
                for (key, data_dict) in rows:
                    data_size = len(data_dict.values()[0])
                    logger.critical('adding %r with %.6f MB' % (key, float(data_size)/2**20))
                    batch.insert(key, data_dict)

            logger.critical('%d rows written' % num_rows)

    finally:
        sm = SystemManager(server)
        try:
            sm.drop_keyspace(keyspace)
        except pycassa.InvalidRequestException:
            pass
        sm.close()
        logger.critical('clearing test keyspace: %r' % keyspace)
Example No. 7
def get_values(servlst, ks, cf, key):
    #print key
    pool = ConnectionPool(ks, servlst)  # created before the try so dispose() in finally is always valid
    try:
        cf_handle = ColumnFamily(pool, cf)
        result = cf_handle.get(key).values()
    except pycassa.NotFoundException:
        print "[ERROR] " + key + " not found"
        result = []
    except Exception as err:
        print "[ERROR] " + str(err)
        sys.exit(-1)
    finally:
        pool.dispose()

    return result
def write(vl, data=None):
#   get a connection from the pycassa connection pool;
#   the keyspace 'Monitor' stores the monitoring data
    pool = ConnectionPool('Monitor', ['localhost:9160'])
#   column families: RawData, Rollups60, Rollups300, Rollups7200, Rollups86400
    col_fam_rawdata = pycassa.ColumnFamily(pool, 'rawdata')

#   partition by time, using one row-key partition per month
    timeString = time.strftime("%Y-%m", time.localtime(vl.time))
    key = [str(vl.host), str(vl.plugin), str(vl.plugin_instance), str(vl.type), str(vl.type_instance), timeString]
    keyString = "#".join(key)

    for i in vl.values:
        col_fam_rawdata.insert(keyString, {vl.time: i})     # insert into RawData

#   also write to a file, as a test (note: only the last value of the loop above is written)
    with open('/tmp/workfile', 'a') as f:
        f.write(keyString + " " + str(vl.time) + " " + str(i) + "\n")

    pool.dispose()
 def _update_analytics_start_time(self, start_time):
     pool = ConnectionPool(COLLECTOR_KEYSPACE, ['127.0.0.1:%s' \
                           % (self.__class__.cassandra_port)])
     col_family = ColumnFamily(pool, SYSTEM_OBJECT_TABLE)
     col_family.insert(SYSTEM_OBJECT_ANALYTICS,
                       {SYSTEM_OBJECT_START_TIME: start_time})
Example No. 10
def write(metric, timestamp, value, tags, ds_type):
    try:
        value = normalize_value(metric, tags, value, timestamp, ds_type)
    except ValueError:
        return

    pool = ConnectionPool(keyspace, [address])

    upertime = timestamp/upertime_interval
#   get the key from the database; if an id does not exist, a new one is created
    key = generate_key(metric, upertime, tags)

#   save to rawdata (reusing the pool created above instead of opening a second one)
    col_fam_rawdata = pycassa.ColumnFamily(pool, 'rawdata')
    col_fam_rawdata.insert(key, {timestamp: value})
    
#   save to rollups60: if within the same minute, update the in-memory average;
#   when a new minute starts, write the finished average to cassandra and reset
    if dictAvg60[metric]['timestamp'] == 0:
        dictAvg60[metric]['avg'] = value
        dictAvg60[metric]['counter'] = 1
    elif inOneMinute(timestamp, dictAvg60[metric]['timestamp']):
        newAvg  = caculate(dictAvg60[metric]['avg'], dictAvg60[metric]['counter'], value)
        dictAvg60[metric]['avg'] = newAvg
        dictAvg60[metric]['counter'] += 1
    else:
        col_fam_rollups60 = pycassa.ColumnFamily(pool, 'rollups60')
        col_fam_rollups60.insert(metric, {dictAvg60[metric]['timestamp']: dictAvg60[metric]['avg']})
        dictAvg60[metric]['avg'] = value
        dictAvg60[metric]['counter'] = 1
    dictAvg60[metric]['timestamp'] = timestamp
    
#   save to rollups300
    if dictAvg300[metric]['timestamp'] == 0:
        dictAvg300[metric]['avg'] = value
        dictAvg300[metric]['counter'] = 1
    elif inFiveMinutes(timestamp, dictAvg300[metric]['timestamp']):
        newAvg  = caculate(dictAvg300[metric]['avg'], dictAvg300[metric]['counter'], value)
        dictAvg300[metric]['avg'] = newAvg
        dictAvg300[metric]['counter'] += 1
    else:
        col_fam_rollups300 = pycassa.ColumnFamily(pool, 'rollups300')
        col_fam_rollups300.insert(metric, {dictAvg300[metric]['timestamp']: dictAvg300[metric]['avg']})
        dictAvg300[metric]['avg'] = value
        dictAvg300[metric]['counter'] = 1
    dictAvg300[metric]['timestamp'] = timestamp
    
#   save to rollups7200
    if dictAvg7200[metric]['timestamp'] == 0:
        dictAvg7200[metric]['avg'] = value
        dictAvg7200[metric]['counter'] = 1
    elif inTwoHours(timestamp, dictAvg7200[metric]['timestamp']):
        newAvg  = caculate(dictAvg7200[metric]['avg'], dictAvg7200[metric]['counter'], value)
        dictAvg7200[metric]['avg'] = newAvg
        dictAvg7200[metric]['counter'] += 1
    else:
        col_fam_rollups7200 = pycassa.ColumnFamily(pool, 'rollups7200')
        col_fam_rollups7200.insert(metric, {dictAvg7200[metric]['timestamp']: dictAvg7200[metric]['avg']})
        dictAvg7200[metric]['avg'] = value
        dictAvg7200[metric]['counter'] = 1
    dictAvg7200[metric]['timestamp'] = timestamp
    
#   save to rollups86400
    if dictAvg86400[metric]['timestamp'] == 0:
        dictAvg86400[metric]['avg'] = value
        dictAvg86400[metric]['counter'] = 1
    elif inOneDay(timestamp, dictAvg86400[metric]['timestamp']):
        newAvg  = caculate(dictAvg86400[metric]['avg'], dictAvg86400[metric]['counter'], value)
        dictAvg86400[metric]['avg'] = newAvg
        dictAvg86400[metric]['counter'] += 1
    else:
        col_fam_rollups86400 = pycassa.ColumnFamily(pool, 'rollups86400')
        col_fam_rollups86400.insert(metric, {dictAvg86400[metric]['timestamp']: dictAvg86400[metric]['avg']})
        dictAvg86400[metric]['avg'] = value
        dictAvg86400[metric]['counter'] = 1
    dictAvg86400[metric]['timestamp'] = timestamp
    pool.dispose()
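
# The four rollup blocks above repeat one pattern, differing only in the
# interval test and the target column family. A hedged sketch of a single
# helper capturing that pattern (update_rollup and in_same_bucket are assumed
# names; caculate() is the running-average helper the original calls):
def update_rollup(pool, cf_name, cache, metric, timestamp, value, in_same_bucket):
    entry = cache[metric]
    if entry['timestamp'] == 0:
        entry['avg'], entry['counter'] = value, 1
    elif in_same_bucket(timestamp, entry['timestamp']):
        entry['avg'] = caculate(entry['avg'], entry['counter'], value)
        entry['counter'] += 1
    else:
        # the bucket rolled over: persist the finished average, start a new one
        pycassa.ColumnFamily(pool, cf_name).insert(metric, {entry['timestamp']: entry['avg']})
        entry['avg'], entry['counter'] = value, 1
    entry['timestamp'] = timestamp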
Example No. 11
# UNABLE TO CONNECT: pycassa speaks Cassandra's Thrift interface (default port
# 9160); 127.0.0.1:9042 below is the CQL native-protocol port, which is the
# likely cause of the failure

import csv
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool('test', ['127.0.0.1:9042'])
cf = ColumnFamily(pool, "testtable")

with open('test.csv', 'rb') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    print str(row)
    key = row['id']
    del row['id']
    cf.insert(key, row)

pool.dispose()

# TO RUN
# $ python seedCassandra.py
Example No. 12
from pycassa.types import *
from pycassa.system_manager import *
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily


def create_ks():
    # create a test column family with a CompositeType(LongType, BytesType) comparator
    sys = SystemManager()
    comparator = CompositeType(LongType(), BytesType())
    sys.create_column_family("testing", "testing", comparator_type=comparator)


pool = ConnectionPool('testing')
cf = ColumnFamily(pool, 'testing')

# Check the column added by the Haskell test script
# print [k for k in cf.get_range()]
# cf.insert("row2", {(125, 'oklahoma'): 'asdf'})

print cf.get('row1')
print cf.get('row2')
# should see: OrderedDict([((125, 'oklahoma'), 'asdf')])
Example No. 13
import pycassa
from pycassa.pool import ConnectionPool
from pycassa import index
from pycassa.columnfamily import ColumnFamily

pool1 = ConnectionPool('MINDNET', ['localhost:9160'], timeout=10000000)
pool2 = ConnectionPool('MINDNET', ['213.136.81.102:9160'], timeout=10000000)


def migr(tab1, tab2, tb):
    #r1=tab1.get_range()
    #tab2.truncate()
    ind = 0
    while True:
        cach = []
        r1 = tab1.get_range()
        for ky, col in r1:
            cach.append([ky, col])
            if len(cach) % 1000 == 0:
                print 'collect(', tb, '):', len(cach)
            if len(cach) >= 500000:
                break
        if len(cach) == 0: break

        b1 = tab2.batch(55000)
        b2 = tab1.batch(55000)
        indc = 0
        for ky, col in cach:
            b1.insert(ky, col)   # write through the batch, not the bare CF
            b2.remove(ky)
            indc += 1
        b1.send()
        b2.send()
Example No. 14
import json

from twitter import settings
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.cassandra.ttypes import NotFoundException

from twitter.backend.base import TimelineFile as BaseTimelineFile
from twitter.backend.base import FollowerFile as BaseFollowerFile

print "Initializing connnection pool..."

POOL = ConnectionPool(settings.CASSANDRA_KEYSPACE,
                      settings.CASSANDRA_POOL,
                      timeout=2)
FOLLOWERS = ColumnFamily(POOL, 'Followers')
USERTIMELINE = ColumnFamily(POOL, 'UserTimeline')
TIMELINE = ColumnFamily(POOL, 'Timeline')
COUNTERS = ColumnFamily(POOL, 'Counters')


class TimelineFile(BaseTimelineFile):
    def __init__(self, user_id):
        BaseTimelineFile.__init__(self, user_id)

    def get_first(self):
        dct = USERTIMELINE.get(self.user_id,
                               column_count=1,
                               column_reversed=True)

        tweet_id = dct[dct.keys()[0]]
Example No. 15
from bs4 import BeautifulSoup
import datetime
import hashlib
import numpy as np
import pandas as pd
import csv
import re
import os

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
pool = ConnectionPool(
    'dnm',
    ['158.85.217.74:9160'])  #Needs to be in the format of '169.53.141.8:9160'
cf = ColumnFamily(pool, 'products')

wdir = "/sandisk1/darknetmarket/silkroad2"

#Ensure the current directory is correctly set
os.chdir(wdir)

c = 0  #Counter
non_decimal = re.compile(r'[^\d.]+')  #Clean strings with numbers

#####CODE TO READ IN CSV OF BITCOIN PRICES GOES HERE#####
bitcoin = pd.read_csv('/sandisk1/darknetmarket/Bitcoin Prices.csv',
                      sep=',')  #Reads in historical bitcoin prices
bitcoin['Date'] = pd.to_datetime(
    bitcoin['Date'])  #Converts 'Date' field to Datetime
#########################################################
Example No. 16
import time

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.cassandra.ttypes import NotFoundException

__all__ = ['get_user_by_username', 'get_friend_usernames',
    'get_follower_usernames', 'get_users_for_usernames', 'get_friends',
    'get_followers', 'get_timeline', 'get_userline', 'get_tweet', 'save_user',
    'save_tweet', 'add_friends', 'remove_friends', 'DatabaseError',
    'NotFound', 'InvalidDictionary', 'PUBLIC_USERLINE_KEY']

POOL = ConnectionPool('Twissandra')

USER = ColumnFamily(POOL, 'User')
FRIENDS = ColumnFamily(POOL, 'Friends')
FOLLOWERS = ColumnFamily(POOL, 'Followers')
TWEET = ColumnFamily(POOL, 'Tweet')
TIMELINE = ColumnFamily(POOL, 'Timeline')
USERLINE = ColumnFamily(POOL, 'Userline')

# NOTE: Having a single userline key to store all of the public tweets is not
#       scalable.  Currently, Cassandra requires that an entire row (meaning
#       every column under a given key) be able to fit in memory.  You can
#       imagine that after a while, the entire public timeline would exceed
#       available memory.
#
#       The fix for this is to partition the timeline by time, so we could use
#       a key like !PUBLIC!2010-04-01 to partition it per day.  We could drill
#       down even further into hourly keys, etc.  Since this is a demonstration
#       and that would add quite a bit of extra code, this exercise is left to
#       the reader.
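
# A sketch of the day-partitioned scheme the note above describes: bucket the
# public userline by day so no single row grows without bound (the helper
# names and column layout are assumptions, not Twissandra's actual code):
import datetime

def public_userline_key(when):
    return '!PUBLIC!' + when.strftime('%Y-%m-%d')

def save_public_tweet(tweet_id, timestamp):
    day_key = public_userline_key(datetime.datetime.utcfromtimestamp(timestamp))
    USERLINE.insert(day_key, {timestamp: tweet_id})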
Example No. 17
import time

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.cassandra.ttypes import NotFoundException

__all__ = ['get_user_by_userid', 'DatabaseError',
    'NotFound', 'InvalidDictionary']

POOL = ConnectionPool(keyspace='TEST', server_list=['localhost:9160'], prefill=False)

USER = ColumnFamily(POOL, 'Users')
Board = ColumnFamily(POOL, 'Board')

class DatabaseError(Exception):
    """
    The base error that functions in this module will raise when things go
    wrong.
    """
    pass

class NotFound(DatabaseError):
    pass


class InvalidDictionary(DatabaseError):
    pass

def get_user_by_userid(userid):
    try:
        user = USER.get(str(userid), columns=['name', 'password'])
Example No. 18
    if keyspace in sysm.list_keyspaces():
        sysm.drop_keyspace(keyspace)
    sysm.create_keyspace(keyspace, system_manager.SIMPLE_STRATEGY,
                         {'replication_factor': '1'})
    sysm.create_column_family(keyspace, columnfamily)
    sysm.alter_column(keyspace, columnfamily, 'strcol', system_manager.ASCII_TYPE)
    sysm.alter_column(keyspace, columnfamily, 'intcol', system_manager.INT_TYPE)
    sysm.alter_column(keyspace, columnfamily, 'longcol', system_manager.LONG_TYPE)
    sysm.alter_column(keyspace, columnfamily, 'floatcol', system_manager.FLOAT_TYPE)
    sysm.alter_column(keyspace, columnfamily, 'doublecol', system_manager.DOUBLE_TYPE)
    sysm.alter_column(keyspace, columnfamily, 'datecol', system_manager.DATE_TYPE)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print "Error.  Pass the name of the YAML configuration file as parameter."
        sys.exit(-1)
    conffile = sys.argv[1]

    sysm = system_manager.SystemManager()
    setup_keyspace(sysm)
    pool = ConnectionPool(keyspace)
    cf = ColumnFamily(pool, columnfamily)

    # Write and read keys
    write(cf)
    clist = read_cl(cf)
    #print "First rows of clist ->", clist[:10]
    sarray = read_np(cf, conffile)
    print "First rows of sarray->", repr(sarray[:10])
Example No. 19
 def setUp(self):
     n = 10000
     self.weibos = self._load_items(n)
     pool = ConnectionPool('master_timeline', server_list=['219.224.135.60:9160', '219.224.135.61:9160'], pool_size=10)
     col_fam = pycassa.ColumnFamily(pool, 'weibos')
     self.weibos_col_fam = col_fam
Example No. 20
import random
import sys
import pycassa

from random import choice
from random import sample
from pycassa.index import *
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from datetime import datetime

t1 = datetime.now()
pool = ConnectionPool('JBJ')
col_fam_master = pycassa.ColumnFamily(pool, 'Master')

delta = 0.003
T0 = 2

#Function: count how many of a node's neighbors have the given color
def get_color_degree(graph,index,color):
	neighbors = list(graph[index][1])
	energy = 0
	for neighbor in neighbors:
		if graph[neighbor][0] == color:
			energy +=1
	return energy
	
def get_neighbor(graph, index):
	temp = list(graph[index][1])
	return temp[random.randint(0,len(temp)-1)]
	
Example No. 21
from pycassa.cassandra.ttypes import *
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

try:
    cp = ConnectionPool("demo")
    cf = ColumnFamily(cp, "Test3")
    cf.insert('2345', {'col_name': 'ss'})  # insert() expects a dict of columns, not a bare string; 'col_name' is a placeholder
    x = cf.get('1234')
    print(x)
except InvalidRequestException as e:
    print("ERROR " + e.why)
except NotFoundException as e:
    print("ERROR " + e.why)

Example No. 22
#!/usr/bin/python
#-*- coding:utf-8 -*-
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool('MyKeyspace')
cf = ColumnFamily(pool, 'MyCF')
#cf.insert('row_key', {'col_name': 'col_val'})
#cf.insert('row_key', {'col_name':'col_val', 'col_name2':'col_val2'})
#cf.batch_insert({'row1': {'name1': 'val1', 'name2': 'val2'},'row2': {'foo': 'bar'}})

print cf.get('row_key')
print cf.get('row_key', columns=['col_name', 'col_name2'])

#for i in xrange(10):
#    cf.insert('row_key', {str(i): 'val'})
print cf.get('row_key', column_start='5', column_finish='7')
print cf.get('row_key', column_reversed=True, column_count=3)
print cf.multiget(['row1', 'row2'])

result = cf.get_range(start='row_key5', finish='row_key7')
for key, columns in result:
    print key, '=>', columns

Example No. 23
import sys
from datetime import datetime as dt
from functools import wraps
import zlib
import msgpack
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.index import create_index_clause, create_index_expression
from pycassa.cassandra.ttypes import NotFoundException, ConsistencyLevel
from pyhackers.common import unix_time_millisecond, time_with_ms, epoch_to_date, unix_time
from pyhackers.config import config

pool = ConnectionPool("sweetio", [config.cassandra])
status_cf = pycassa.ColumnFamily(pool, "status")
user_timeline_cf = pycassa.ColumnFamily(pool, "user_timeline")
user_cf = pycassa.ColumnFamily(pool, "user2")
channel_timeline_cf = pycassa.ColumnFamily(pool, "channel_timeline")

#create column family user_following_timeline with comparator = IntegerType;
user_following_timeline_cf = pycassa.ColumnFamily(pool, "user_following_timeline")
counters_cf = pycassa.ColumnFamily(pool, "counters")

status_upvotes_cf = pycassa.ColumnFamily(pool, "status_upvotes")
status_downvotes_cf = pycassa.ColumnFamily(pool, "status_downvotes")
status_replies_cf = pycassa.ColumnFamily(pool, "status_replies")
status_resweets_cf = pycassa.ColumnFamily(pool, "status_resweets")
status_favs_cf = pycassa.ColumnFamily(pool, "status_favs")

user_follower_cf = pycassa.ColumnFamily(pool, "user_followers")
user_following_cf = pycassa.ColumnFamily(pool, "user_following")
Example No. 24
def query2(user_id):
    pool = ConnectionPool('tuitterdb')
    followsTweets_family = ColumnFamily(pool,'followsTweets')
    # Print the tweets of the followers of user supplied in the parameter.
    query(followsTweets_family,user_id)
    pool.dispose()
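
# The query() helper used by query1/query2 on this page is not shown.
# A minimal sketch of what it might look like (assumptions: the row key is
# the user id and each column holds one tweet):
import pycassa

def query(column_family, user_id):
    try:
        for column, value in column_family.get(user_id).items():
            print column, value
    except pycassa.NotFoundException:
        print 'no tweets found for', user_id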
Example No. 25
        with open(filename, 'w') as f:
            f.write(
                zlib.compress(
                    cPickle.dumps(self.current_day_bitarray,
                                  protocol=cPickle.HIGHEST_PROTOCOL)))

    def union_current_day(self, bf):
        """Union only the current_day of an other BF."""
        self.bitarray = self.bitarray | bf.current_day_bitarray
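
    def load_current_day(self, filename):
        # Hedged counterpart to the save above, assuming the same
        # zlib-compressed cPickle format (this method name is an assumption):
        with open(filename) as f:
            self.current_day_bitarray = cPickle.loads(zlib.decompress(f.read()))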


if __name__ == "__main__":
    import numpy as np
    from pycassa.pool import ConnectionPool

    pool = ConnectionPool('parsely')

    bf = DailyTemporalBloomFilter(100000, 0.01, 60, 'session_site', './', pool)

    random_items = [str(r) for r in np.random.randn(200000)]
    for item in random_items[:100000]:
        bf.add(item)

    false_positive = 0
    for item in random_items[100000:200000]:
        if item in bf:
            false_positive += 1

    print "Error rate (false positive): %s" % str(
        float(false_positive) / 100000)
Example No. 26
#!/usr/bin/python
#-*- coding:utf-8 -*-
import csv
import glob
from pycassa.system_manager import *
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

sysm = SystemManager('localhost:9160')  # named sysm so the sys module is not shadowed
#sysm.create_keyspace('employees', SIMPLE_STRATEGY, {'replication_factor': '1'})
pool = ConnectionPool('employees')
filenames = glob.glob('employees/*.csv')
for filename in filenames:
    only_name = filename.split('/')[-1].split('.')[0]
    print only_name
    #sysm.create_column_family('employees',only_name,super=False)
    #sysm.drop_column_family('employees',only_name)
    cf = ColumnFamily(pool, only_name)
    csv_file = open(filename, 'rb')
    reader = csv.reader(csv_file)
    r = 1
    for row in reader:
        c = 1
        for col in row:
            cf.insert('row' + str(r), {'col' + str(c): col})
            c += 1
        r += 1
Example No. 27
 #nodeCfg = {'ip': '172.16.40.147','hostname':'centos123','user':'******','passwd':'1','desc':u'no description'}
 nodeinfo1 = {'software':'apache2','ver':'2.0.1','docbase':'/var/www'}
 nodeinfo2 = {'software':'cassandra','ver':'1.0.12','seeds':'172.16.40.145'}
 cols = [{'software':UTF8_TYPE,'ver':UTF8_TYPE}]
 #CreateCFByDefaultConn(ks,testcf,cols)
 #ip_name:
 #   apache:ver
 #   apache:docbase
 #   cassandra:ver
 #   cassandra:seeds
 #comparator = CompositeType(UTF8Type(), UTF8Type(),UTF8Type())
 #cols = [{"param":comparator}]
 
 #CreateCompositeCF(getConnectString()[0],ks,testcf,None,[comparator])
 
 pool = ConnectionPool(ks, getConnectString())
 #print(pool)
 # key = '172.16.40.145:cent_client1:'+nodeinfo1.get('software')
 # print(key)
 #UpdateValue(pool,testcf[0],'172.16.40.147',{('172.16.40.147','centos123','tomcat','7.0','port'):'8080'})
 #key = '172.16.40.145:cent_client1:'+nodeinfo2.get('software')
 #print(key)
 #UpdateValue(pool,testcf[0],key,nodeinfo2)
 
 #s = GetValue(pool,'testcf','172.16.40.145')
 #print(s)
 # update column family testcf with column_metadata=[{column_name:docbase, validation_class: UTF8Type, index_type: KEYS}]
 
 '''
 CREATE TABLE testcf (
   key ascii,
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
import csv
import time

pool = ConnectionPool('highwaydata',
                      ['10.138.0.5', '10.138.0.4', '10.138.0.3'],
                      use_threadlocal=False,
                      pool_size=3)

detectorFile = '/home/highway_data/csv_fies/ProjectData-Cloud2015/freeway_detectors.csv'
loopFile = '/home/highway_data/csv_fies/ProjectData-Cloud2015/freeway_loopdata.csv'

superLoops_start_time = time.time()
print('starting to load detector loopdata supercolumn family')
with open(detectorFile, 'rU') as fin:
    cin = csv.DictReader(fin)
    detectorData = [row for row in cin]

col_fam_detectors = ColumnFamily(pool, 'superLoops')

for detector in detectorData:
    detectorid = detector['detectorid']

    col_fam_detectors.insert(
        detectorid, {
            'detectorInfo': {
                'highwayid': detector['highwayid'],
                'milepost': detector['milepost'],
                'locationtext': detector['locationtext'],
Example No. 29
def setup_module():
    global pool
    credentials = {'username': '******', 'password': '******'}
    pool = ConnectionPool(TEST_KS, pool_size=10, credentials=credentials)
Example No. 30
    'mobile:Your Club Specials', 
    'mobile:Weekly Specials', 
    'mobile:savings'
]
    
pageNameList = [
    'mobile:safeway:savings', 
    'mobile:safeway:savings:couponctr', 
    'mobile:safeway:savings:personaldeal', 
    'mobile:safeway:savings:clubspecial', 
    'mobile:safeway:Weekly Specials', 
    'mobile:safeway:Your Club Specials'
]

print 'Connecting to Cassandra ' + gKeyspace + '/' + gColumnFamily + '...'
pool = ConnectionPool(gKeyspace, ['10.5.14.58:9160'])
deviceLogCF = pycassa.ColumnFamily(pool, gColumnFamily) 
print 'Connected to ' + gKeyspace + '/' + gColumnFamily

if len(sys.argv) < 2:
    usage()
    sys.exit()

operation = sys.argv[1]

option = "full"
if (len(sys.argv) >= 3):
    option = sys.argv[2]

if (operation == 'insert'):
    # profile1()
Example No. 31
#EXEC
import mdER
import mdNeural
import umisc
import sys
sys.path.append('./pymongo')
sys.path.append('./pycassa')
import pycassa
from pycassa.pool import ConnectionPool
from pycassa import index
from pycassa.columnfamily import ColumnFamily
import pymongo
import bson
#============ local test database ============================
pool2 = ConnectionPool('MINDNET', ['localhost:9160'], timeout=10)
to_posting = pycassa.ColumnFamily(pool2, 'to_posting')
wb3 = pycassa.ColumnFamily(pool2, 'web_cache3')
to_posting2 = pycassa.ColumnFamily(pool2, 'to_posting')
to_posting3 = pycassa.ColumnFamily(pool2, 'to_posting')
#=========== production database =============================
'''
MONGO_URL='mongodb://*****:*****@91.205.172.85:27017/mdnet'
connMC = pymongo.Connection(MONGO_URL)
dbMC=connMC.mdnet
to_posting1=dbMC['to_posting']
'''
'''
pool2 = ConnectionPool('MINDNET', ['79.143.185.3:9160'],timeout=10000)
#to_posting2 = pycassa.ColumnFamily(pool2, 'to_posting')
to_posting3 = pycassa.ColumnFamily(pool2, 'to_posting2')
Example No. 32
from pycassa import index

import logging
from StringIO import StringIO

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger('pblnksExtra')

ch = logging.StreamHandler()
lbuffer = StringIO()
logHandler = logging.StreamHandler(lbuffer)

log.addHandler(logHandler)
log.addHandler(ch)

pool2 = ConnectionPool('MINDNET', ['79.143.185.3:9160'], timeout=10)
tab2 = pycassa.ColumnFamily(pool2, 'cache_products')
wb2 = pycassa.ColumnFamily(pool2, 'web_cache3')  # place to index


def short_url(urllong):
    return bitly.short_url(urllong)


def lomadeezar_links(links_tw):
    areturn = []
    lnk = []
    ind = 1
    for l in links_tw:
        #l=urllib.quote(l)
        lnk.append(['link' + str(ind), l])
Example No. 33
    def test_pool(self):
        listener = StatsLogger()
        pool = ConnectionPool(pool_size=5, max_overflow=5, recycle=10000,
                              prefill=True, pool_timeout=0.1, timeout=1,
                              keyspace='PycassaTestKeyspace', credentials=_credentials,
                              listeners=[listener], use_threadlocal=False)
        conns = []
        for i in range(10):
            conns.append(pool.get())
        assert_equal(listener.stats['created']['success'], 10)
        assert_equal(listener.stats['created']['failure'], 0)
        assert_equal(listener.stats['checked_out'], 10)
        assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})

        # Pool is maxed out now
        assert_raises(NoConnectionAvailable, pool.get)
        assert_equal(listener.stats['created']['success'], 10)
        assert_equal(listener.stats['checked_out'], 10)
        assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})
        assert_equal(listener.stats['at_max'], 1)

        for i in range(0, 5):
            pool.return_conn(conns[i])
        assert_equal(listener.stats['disposed']['success'], 0)
        assert_equal(listener.stats['checked_in'], 5)
        assert_equal(listener.stats['opened'], {'current': 5, 'max': 10})

        for i in range(5, 10):
            pool.return_conn(conns[i])
        assert_equal(listener.stats['disposed']['success'], 5)
        assert_equal(listener.stats['checked_in'], 10)

        conns = []

        # These connections should come from the pool
        for i in range(5):
            conns.append(pool.get())
        assert_equal(listener.stats['created']['success'], 10)
        assert_equal(listener.stats['checked_out'], 15)

        # But these will need to be made
        for i in range(5):
            conns.append(pool.get())
        assert_equal(listener.stats['created']['success'], 15)
        assert_equal(listener.stats['checked_out'], 20)

        assert_equal(listener.stats['disposed']['success'], 5)
        for i in range(10):
            conns[i].return_to_pool()
        assert_equal(listener.stats['checked_in'], 20)
        assert_equal(listener.stats['disposed']['success'], 10)

        assert_raises(InvalidRequestError, conns[0].return_to_pool)
        assert_equal(listener.stats['checked_in'], 20)
        assert_equal(listener.stats['disposed']['success'], 10)

        print("in test:", id(conns[-1]))
        conns[-1].return_to_pool()
        assert_equal(listener.stats['checked_in'], 20)
        assert_equal(listener.stats['disposed']['success'], 10)

        pool.dispose()
Example No. 34
import os, sys
import pycassa 
import logging 
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from datetime import datetime as d 

__author__ = 'rahul'


logging.basicConfig(filename="example.log",level=logging.DEBUG,format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')

pool = ConnectionPool('icepice')
col_fam = ColumnFamily(pool, 'info')

#Sample Insert operations 
#col_fam.insert('row_key',{'col_name' : 'value' })

'''Sample Get operations. 
Need to wrap the get operations in exception block for safe failing from
'key not found error'.
'''
try:
  for i in range(0,1000):
    s = d.now()
    col_fam.get(i)
    logging.info(str(i) +"=>"+ str((d.now() - s).microseconds))

except Exception:
  logging.info("Error" + str(sys.exc_info()[0]))
Example No. 35
    def __init__(self, keyspace, column_family_name):
        self.pool = ConnectionPool(keyspace, cassandra_settings.NODE_POOL)

        self.cf = ColumnFamily(self.pool, column_family_name)

        self.batch = {}
Example No. 36
# Include python library for pycassa
import sys
import functions
import node
import pycassa

from pycassa.index import *
from datetime import datetime

# Connect to Cassandra Instance
from pycassa.pool import ConnectionPool
pool = ConnectionPool('OSN')
#pool = ConnectionPool('OSN', ['localhost:9160'])

from pycassa.columnfamily import ColumnFamily
col_fam_master = pycassa.ColumnFamily(pool, 'Master')
col_fam_replica = pycassa.ColumnFamily(pool, 'Replica')
col_fam_edge = pycassa.ColumnFamily(pool, 'Edge')
col_fam_mme = pycassa.ColumnFamily(pool, 'Master_Master_Edge')
col_fam_mse = pycassa.ColumnFamily(pool, 'Master_Slave_Edge')

# Input: Number of servers & number of replicas (K-redundancy)
total_servers = 4
total_replicas = 2

# Variables
server_id = 0
num_replica = 0
replica_id = 0

total_replicas_needed = 0
Example No. 37

# set up the cassandra object
cass = pycassa.system_manager.SystemManager('localhost')

# Normally you wouldn't drop the keyspace first
# I only do it here to make everything clean
print "Dropping keyspace"
if 'jedberg_test' in cass.list_keyspaces():
    cass.drop_keyspace('jedberg_test')

# create the keyspace
print "Creating keyspace"
cass.create_keyspace('jedberg_test', topology, {'replication_factor': '1'})
cass.ks = 'jedberg_test'
pool = ConnectionPool('jedberg_test')
conn = pool.get()

# create the column families
families = ['collected_properties',
            'collection_cache_by_times',
            'collections_by_cache']

print "Creating column families"
for fam in families:
    cass.create_column_family(cass.ks, fam)

# Let's see if those keyspaces are set up correctly
print "Keyspaces: "
print cass.list_keyspaces()
print
Example No. 39
def serve_stats(dmu, dmu2):
    global atu_reg
    s.bind((host, port))        # Bind to the port
    s.listen(5)                 # Now wait for client connection.
    while True:
        c, addr = s.accept()    # Establish connection with client.
        #atu_reg = 'Got connection from', addr
        msg = c.recv(1024)      # read and ignore the request body
        #print addr, ' >> ', msg
        msg = str(atu_reg)      # reply with the current status string
        c.send(msg)
   

atu_reg= 'Connect to cassandra...'
pool2 = ConnectionPool('MINDNET', ['91.205.172.85:9160'],timeout=10000)
fcb = pycassa.ColumnFamily(pool2, 'fcb_users2')

fcb2 = pycassa.ColumnFamily(pool2, 'fcb_users3')


thread.start_new_thread(serve_stats,(0,0) )

ind_files=1

total_collected=0


import os.path

Example No. 40
def query1(user_id):
    pool = ConnectionPool('tuitterdb')
    userTweets_family = ColumnFamily(pool, 'userTweets')
    # Print the tweets of the user supplied in the parameter. 
    query(userTweets_family,user_id)
    pool.dispose()
Example No. 41
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
import csv

pool = ConnectionPool('highwayData', ['localhost:9160'])

#change to not super column
col_fam = ColumnFamily(pool, 'stationid')
print('\n\ngetting record for station 1098 from the stations column family')
print(col_fam.get('1098'))
print('\n\n')

col_fam_detectors = ColumnFamily(pool, 'detectors')
print(
    'getting record for detector 1345, 09-15-2011 from the detectors & loopdata super-column family'
)
print('record check: 1345,2011-09-15 00:00:00-07,0,,0,0,0')
print(col_fam_detectors.get(
    '1345',
    columns=['2011-09-15 00:00:00-07'],
))
print('\n\n')
#print(col_fam_detectors.get('1345'))

#1346,2011-09-24 21:21:20-07,7,63,11,2,0
#1348,2011-11-06 03:53:20-08,0,,0,0,0
Example No. 42
# capture web_know

import pycassa
from pycassa.pool import ConnectionPool
from pycassa import index
from pycassa.columnfamily import ColumnFamily

pool2 = ConnectionPool('MINDNET', ['91.205.172.85:9160'], timeout=10000)

pool1 = ConnectionPool('MINDNET', ['79.143.185.3:9160'], timeout=10000)

tb_web1 = pycassa.ColumnFamily(pool1, 'web_know')
tb_web2 = pycassa.ColumnFamily(pool2, 'web_know')
#===
tb_web1.truncate()

rg1 = tb_web2.get_range()
ind = 0
for k, r in rg1:
    tb_web1.insert(k, r)
    print r
    ind += 1
    if ind % 1000 == 0:
        print 'ind:', ind
Example No. 43
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.index import *
from pycassa.cassandra import ttypes
import json
import datetime


class JSONDateTimeEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        else:
            return json.JSONEncoder.default(self, obj)


def has_many_woeids(x):
    key, values = x
    return len(values) > 1


pool = ConnectionPool('processing_llama_Processor')
trends = ColumnFamily(pool, 'Trend')

for trend_name, country_specifics in filter(has_many_woeids,
                                            trends.get_range()):
    print json.dumps(country_specifics,
                     sort_keys=True,
                     indent=4,
                     separators=(',', ': '),
                     cls=JSONDateTimeEncoder)
    #track_trend(trend_name, country_specifics['query'], country_specifics.keys)
Example No. 44
import time
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

import sys

sys.path.append('/Neural')

import conn3
conn = conn3.conn_mx

pool2 = ConnectionPool('MINDNET', ['localhost:9160'], timeout=10)
tab2 = pycassa.ColumnFamily(pool2, 'fcb_users')
'''
  create column family fcb_users 
    with comparator = UTF8Type
    and default_validation_class = UTF8Type
    and column_metadata = [{
        column_name : user_name,
        validation_class : UTF8Type,
        index_name : user_name_idx1,
        index_type : 0},
        {
        column_name : id,
        validation_class : UTF8Type,
        index_name : id_idx1,
        index_type : 0},
        {
        column_name : u_name,
        validation_class : UTF8Type,
class Cassa(object):
    '''
    Provides a simple key=value functionality built on a cassandra
    table with a key and a single column.  Used in ZookeeperTaskQueue
    to replace the two tables that grow in size with the number of
    tasks rather than the number of workers.
    '''

    def __init__(self, namespace, server_list=['localhost:9160']):
        # save cassandra server
        self.server_list = server_list
        self.namespace = namespace
        self._closed = False

        #setup_logging(self)

        # Connect to the server creating the namespace if it doesn't
        # already exist
        try:
            self.pool = ConnectionPool(namespace, self.server_list, max_retries=500, pool_timeout=600, timeout=10)
        except pycassa.InvalidRequestException:
            self._create_namespace(namespace)
            self.pool = ConnectionPool(namespace, self.server_list, max_retries=500, pool_timeout=600, timeout=10)

        try:
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')
        except pycassa.NotFoundException:
            self._create_column_family('tasks', 
                                       key_validation_class=ASCII_TYPE, 
                                       bytes_columns=['task_data'])
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')

        try:
            self._available = pycassa.ColumnFamily(self.pool, 'available')
        except pycassa.NotFoundException:
            self._create_column_family('available', 
                                        key_validation_class=ASCII_TYPE, 
                                        bytes_columns=['available'])
            self._available = pycassa.ColumnFamily(self.pool, 'available')

        try:
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family('task_count', 
                                       key_validation_class=ASCII_TYPE, 
                                       counter_columns=['task_count'])
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
            self._task_count.insert('RowKey', {'task_count': 0})

        try:
            self._available_count = pycassa.ColumnFamily(self.pool, 'available_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family('available_count', 
                                       key_validation_class=ASCII_TYPE, 
                                       counter_columns=['available_count'])
            self._available_count = pycassa.ColumnFamily(self.pool, 'available_count')
            self._available_count.insert('RowKey', {'available_count': 0})

    def delete_namespace(self):
        sm = SystemManager(random.choice(self.server_list))
        sm.drop_keyspace(self.namespace)
        sm.close()

    def _create_namespace(self, namespace):
        sm = SystemManager(random.choice(self.server_list))
        sm.create_keyspace(namespace, SIMPLE_STRATEGY, {'replication_factor': '1'})
        sm.close()

    def _create_column_family(self, family, bytes_columns=[], 
                              key_validation_class=TIME_UUID_TYPE):
        '''
        Creates a column family of the name 'family' and sets any of
        the names in the bytes_column list to have the BYTES_TYPE.

        key_validation_class defaults to TIME_UUID_TYPE and could also
        be ASCII_TYPE for md5 hash keys, like we use for 'inbound'
        '''
        sm = SystemManager(random.choice(self.server_list))
        # sys.create_column_family(self.namespace, family, super=False)
        sm.create_column_family(self.namespace, family, super=False,
                key_validation_class = key_validation_class, 
                default_validation_class  = TIME_UUID_TYPE,
                column_name_class = ASCII_TYPE)
        for column in bytes_columns:
            sm.alter_column(self.namespace, family, column, BYTES_TYPE)
        sm.close()

    def _create_counter_column_family(self, family, counter_columns=[],
                              key_validation_class=UTF8Type):
        '''
        Creates a counter column family of the name 'family' and sets any of
        the names in the counter_columns list to have COUNTER_COLUMN_TYPE.

        key_validation_class defaults to UTF8Type and could also
        be ASCII_TYPE for md5 hash keys, like we use for 'inbound'
        '''
        sm = SystemManager(random.choice(self.server_list))
        # sys.create_column_family(self.namespace, family, super=False)
        sm.create_column_family(self.namespace, family, super=False,
                key_validation_class = key_validation_class, 
                default_validation_class="CounterColumnType",
                column_name_class = ASCII_TYPE)
        for column in counter_columns:
            sm.alter_column(self.namespace, family, column, COUNTER_COLUMN_TYPE)
        sm.close()

    def tasks(self, key_prefix=''):
        '''
        generate the data objects for every task
        '''
        for row in self._tasks.get_range():
            logger.debug(row)
            if not row[0].startswith(key_prefix):
                continue
            data = json.loads(row[1]['task_data'])
            data['task_key'] = row[0]
            yield data

    def put_task(self, key, task_data):
        try:
            found = self._tasks.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False

        self._tasks.insert(key, {'task_data': json.dumps(task_data)})
        if not exists:
            self._task_count.insert('RowKey', {'task_count': 1})
        return exists

    def get_task(self, key):
        data = self._tasks.get(key)
        return json.loads(data['task_data'])

    def pop_task(self, key):
        self._tasks.remove(key)
        self._task_count.insert('RowKey', {'task_count': -1})
        return key

    @property
    def task_keys(self):
        c = 0
        for key, _ in self._tasks.get_range(column_count=0, filter_empty=False):
            c += 1
            yield key

    def num_tasks(self):
        data = self._task_count.get('RowKey')
        return data['task_count']

    def num_available(self):
        data = self._available_count.get('RowKey')
        return data['available_count']

    def put_available(self, key):
        ## closest thing to storing only the key
        try:
            found = self._available.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False

        if not exists:
            self._available.insert(key, {'available': ''})
            self._available_count.insert('RowKey', {'available_count': 1})

    #def push_batch(self, row_iter):
    #    '''
    #    Push opaque vertex data objects into the inbound queue
    #    '''
    #    return self._tasks.batch_insert({k: json.dumps(v) for k, v in row_iter})

    def get_random_available(self, max_iter=10000):
        '''
        get a random key out of the first max_iter rows
        '''
        c = 1
        keeper = None
        ## note the ConsistencyLevel here.  If we do not do this, and
        ## get all slick with things like column_count=0 and
        ## filter_empty=False, then we can get keys that were recently
        ## deleted... EVEN if the default consistency would seem to
        ## rule that out!

        ## note the random start key, so that we do not always hit the
        ## same place in the key range with all workers
        #random_key = hashlib.md5(str(random.random())).hexdigest()
        #random_key = '0' * 32
        #logger.debug('available.get_range(%r)' % random_key)
        ## scratch that idea: turns out that using a random start key
        ## OR using row_count=1 can cause get_range to hang for hours

        ## why we need ConsistencyLevel.ALL on a single node is not
        ## clear, but experience indicates it is needed.

        ## note that putting a finite row_count is problematic in two
        ## ways:
        # 1) if there are more workers than max_iter, some will not
        # get tasks
        #
        # 2) if there are more than max_iter records, then all workers
        # have to wade through all of these just to get a task!  What
        # we really want is a "pick random row" function, and that is
        # probably best implemented using CQL3 token function via the
        # cql python module instead of pycassa...
        for row in self._available.get_range(row_count=max_iter, read_consistency_level=pycassa.ConsistencyLevel.ALL):
        #for row in self._available.get_range(row_count=100):
            logger.debug('considering %r' % (row,))
            if random.random() < 1.0 / c:  # 1.0, not 1: integer division would make this always False for c > 1 in Python 2
                keeper = row[0]
            if c == max_iter:
                break
            c += 1
        return keeper

    def in_available(self, key):
        try:
            row = self._available.get(key)
            return True
        except pycassa.NotFoundException:
            return False

    def pop_available(self, key):
        self._available.remove(key, write_consistency_level=pycassa.ConsistencyLevel.ALL)
        self._available_count.insert('RowKey', {'available_count': -1})
        assert not self.in_available(key)
        return key

    def close(self):
        self._closed = True
        if hasattr(self, 'pool'):
            self.pool.dispose()
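
# A minimal usage sketch for the Cassa class above (assumes a Cassandra node
# reachable at localhost:9160; the namespace name is arbitrary):
if __name__ == '__main__':
    cassa = Cassa('test_namespace')
    cassa.put_task('job-1', {'url': 'http://example.com'})
    print cassa.num_tasks()
    print cassa.get_task('job-1')
    cassa.pop_task('job-1')
    cassa.close()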