def refresh(self, items, consistent=False):
    """
    Overwrite model data with freshest from database

    Parameters
    ----------
    items : list or :class:`~flywheel.models.Model`
        Models to sync
    consistent : bool, optional
        If True, force a consistent read from the db. (default False)

    """
    # Normalize to a list of models
    if isinstance(items, Model):
        items = [items]
    if not items:
        return
    # Group the models by the DynamoDB table they live in
    by_table = defaultdict(list)
    for model in items:
        by_table[model.meta_.ddb_tablename].append(model)
    # One batch_get per table, then copy each fetched row back into the
    # corresponding model under its loading_ context
    for tablename, models in by_table.iteritems():
        table = Table(tablename, connection=self.dynamo)
        pkeys = [model.pk_dict_ for model in models]
        fetched = table.batch_get(pkeys, consistent=consistent)
        for model, row in itertools.izip(models, fetched):
            with model.loading_(self):
                for field, value in row.items():
                    model.set_ddb_val_(field, value)
class Comment(object):
    """Thin data-access wrapper around the DynamoDB 'Comment' table."""

    def __init__(self):
        self.table = Table('Comment')

    def create(self, comment_id, comment_info):
        """Insert a new comment row keyed by *comment_id*."""
        self.table.put_item(data={'comment_id': comment_id,
                                  'info': comment_info})

    def get(self, comment_id):
        """Return the 'info' attribute of a single comment."""
        return self.table.get_item(comment_id=comment_id)['info']

    def update(self, comment_id, comment_info):
        """Replace the 'info' attribute of an existing comment."""
        row = self.table.get_item(comment_id=comment_id)
        row['info'] = comment_info
        row.save()

    def batch_query(self, comment_id_list):
        """Fetch many comments in one request; return their 'info' values."""
        key_dicts = [{'comment_id': cid} for cid in comment_id_list]
        rows = self.table.batch_get(keys=key_dicts)
        return [row['info'] for row in rows]
class Post(object):
    """Thin data-access wrapper around the DynamoDB 'Post' table."""

    def __init__(self):
        self.table = Table('Post')

    def create(self, post_id, post_info):
        """Insert a new post row keyed by *post_id*."""
        self.table.put_item(data={'post_id': post_id, 'info': post_info})

    def get(self, post_id):
        """Return the 'info' attribute of a single post."""
        return self.table.get_item(post_id=post_id)['info']

    def update(self, post_id, post_info):
        """Replace the 'info' attribute of an existing post."""
        row = self.table.get_item(post_id=post_id)
        row['info'] = post_info
        row.save()

    def batch_query(self, post_id_list):
        """Fetch many posts in one request; return their 'info' values."""
        key_dicts = [{'post_id': pid} for pid in post_id_list]
        rows = self.table.batch_get(keys=key_dicts)
        return [row['info'] for row in rows]
class Collection(CollectionEngine):
    """DynamoDB-backed collection engine wrapping a single boto2 Table."""

    def __init__(self, name, table_name, region, host=None, is_secure=None,
                 port=None):
        """
        Connect to *region* and bind to *table_name*.

        host / is_secure / port are optional connection overrides (e.g. for
        a local DynamoDB endpoint); only the ones supplied are forwarded.
        """
        kwargs = {}
        if host is not None:
            kwargs['host'] = host
        if is_secure is not None:
            kwargs['is_secure'] = is_secure
        if port is not None:
            kwargs['port'] = port
        self.__table = Table(
            table_name=table_name,
            connection=boto.dynamodb2.connect_to_region(region, **kwargs),
        )

    @property
    def table(self):
        """The underlying boto Table."""
        return self.__table

    def create_raw_item(self, index, data_dict, context):
        """Write *data_dict*; route the put through *context* when given."""
        if context is None:
            self.__table.put_item(data_dict)
        else:
            context.put_item(self.__table, data_dict)
        return BotoItem(self.__table, data_dict, True)

    def retrieve_raw_item(self, key_dict):
        """Fetch one item by key; raise KeyError when it does not exist.

        BUG FIX: removed the original trailing ``except: raise`` clause — a
        bare except that only re-raises is a no-op and needlessly catches
        system-exiting exceptions.
        """
        try:
            return self.__table.get_item(**key_dict)
        except ItemNotFound:
            # present a dict-like contract to callers
            raise KeyError(key_dict)

    def query_raw_items(self, index, parent_key_value, **kwargs):
        """Query *index*, optionally constrained to a parent hash key.

        *parent_key_value* is a (attribute_name, value) pair translated
        into an ``attr__eq=value`` filter.
        """
        if parent_key_value is not None:
            kwargs['{}__eq'.format(parent_key_value[0])] = parent_key_value[1]
        return self.__table.query(index=index, **kwargs)

    def bulk_get_raw_items(self, **kwargs):
        """Pass-through to Table.batch_get."""
        return self.__table.batch_get(**kwargs)

    @classmethod
    def get_context(cls, *args, **kwargs):
        """Return a fresh write context."""
        return Context()
class Event(object):
    """Data-access helpers for the DynamoDB 'Event' table."""

    def __init__(self):
        self.table_event = Table('Event')

    def create_new_event(self, event_id, event_info):
        """Insert a new event row keyed by *event_id*.

        BUG FIX: removed a leftover debug ``print(event_id)``.
        """
        data = dict(event_id=event_id, info=event_info)
        self.table_event.put_item(data=data)

    def get_event_info_by_event_id(self, event_id):
        """Return the 'info' attribute of one event."""
        event = self.table_event.get_item(event_id=event_id)
        return event['info']

    def update_event_info_by_event_id(self, event_id, event_info):
        """Overwrite the 'info' attribute of an existing event."""
        event = self.table_event.get_item(event_id=event_id)
        event['info'] = event_info
        event.save()

    def batch_query_by_event_id_list(self, event_id_list):
        """Batch-fetch events; return a list of {event_id, info} dicts."""
        keys = [dict(event_id=event_id) for event_id in event_id_list]
        many_events = self.table_event.batch_get(keys=keys)
        return [dict(event_id=event['event_id'], info=event['info'])
                for event in many_events]
class DynamoDBAdapter(key_value_store.KeyValueStore):
    """
    Implementation of an abstract key-value store defined in
    key_value_store.py. The underlying database is amazon DynamoDB.

    The store keeps all objects in a single table with following schema:
    [HashKey('kind', data_type=STRING), RangeKey('id')]. 'kind' is the
    string with the object type ('vector', 'set' or 'int') and 'id' is the
    object id. The object value is stored in the 'value' attribute of the
    table items. The table should be created before this code is executed.

    Amazon configuration is assumed to be stored in ~/.boto file as
    described in http://boto.readthedocs.org/en/latest/boto_config_tut.html
    """

    def __init__(self, precision=np.dtype('float32'), table_name='test'):
        """
        Create an instance of the dynamodb key-value store.

        precision - a numpy type, elements of all vectors are converted and
        stored in this type; table_name - the name of the DynamoDB table
        which keeps the objects.
        """
        conn = boto.dynamodb2.connect_to_region('eu-west-1')
        if not isinstance(precision, np.dtype):
            raise TypeError("Precision should be a numpy.dtype subtype")
        self.precision = precision
        self.precision_name = precision.name
        self.table = Table(table_name, connection=conn)

    def _get_or_create_item(self, kind, item_id):
        # Fetch the (kind, id) item, or build a fresh unsaved Item shell.
        try:
            item = self.table.get_item(kind=kind, id=item_id)
        except ItemNotFound:
            item = Item(self.table)
            item['kind'] = kind
            item['id'] = item_id
        return item

    def _create_vector_item(self, vec_id, vector):
        # Serialize the vector in the configured precision and remember the
        # precision name so the bytes can be decoded later.
        item = self._get_or_create_item('vector', vec_id)
        item['value'] = Binary(vector.astype(self.precision).tostring())
        item['precision'] = self.precision_name
        return item

    def _vector_value(self, item):
        # Inverse of _create_vector_item: decode bytes back into an array.
        return np.fromstring(str(item['value']), np.dtype(item['precision']))

    def get_vector_ids(self):
        """Return the ids of all stored vectors."""
        return [v['id'] for v in self.table.query_2(kind__eq='vector')]

    def get_int_ids(self):
        """Return the ids of all stored ints."""
        return [v['id'] for v in self.table.query_2(kind__eq='int')]

    def get_set_ids(self):
        """Return the ids of all stored sets."""
        return [v['id'] for v in self.table.query_2(kind__eq='set')]

    def store_vector(self, vec_id, vector):
        """Store a single vector under *vec_id*."""
        item = self._create_vector_item(vec_id, vector)
        item.save()

    def get_vector(self, vec_id):
        """Return the vector stored under *vec_id*; KeyError if absent."""
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id,))
        return self._vector_value(item)

    def bulk_get_vector(self, vec_ids):
        """Batch-fetch many vectors; order follows the batch_get results."""
        keys = [{'kind': 'vector', 'id': i} for i in vec_ids]
        vs = self.table.batch_get(keys=keys)
        return [self._vector_value(i) for i in vs]

    def remove_vector(self, vec_id):
        """Delete the vector under *vec_id*; KeyError if absent."""
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id,))
        item.delete()

    def add_to_set(self, set_id, element_id):
        """Add *element_id* to the set *set_id*, creating it if needed."""
        item = self._get_or_create_item('set', set_id)
        # reset 'value' if missing or of an unexpected type
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            item['value'] = set()
        item['value'].add(element_id)
        item.save(overwrite=True)

    def remove_from_set(self, set_id, element_id):
        """Remove *element_id* from set *set_id*; KeyError on any miss."""
        try:
            item = self.table.get_item(kind='set', id=set_id)
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id,))
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            raise KeyError('Incorrect value in item %s' % (set_id,))
        if element_id not in item['value']:
            raise KeyError('Element %s not in set %s' % (element_id, set_id))
        item['value'].remove(element_id)
        item.save()

    def remove_set(self, set_id):
        """Delete the whole set *set_id*; KeyError if absent."""
        try:
            item = self.table.get_item(kind='set', id=set_id)
            item.delete()
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id,))

    def get_set(self, set_id):
        """Return set *set_id* with entries coerced to str."""
        try:
            the_set = self.table.get_item(kind='set', id=set_id)['value']
            return set([str(entry) for entry in the_set])
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id,))

    def store_int(self, int_id, integer):
        """Store a single integer under *int_id*."""
        item = self._get_or_create_item('int', int_id)
        item['value'] = integer
        item.save()

    def get_int(self, int_id):
        """Return the int stored under *int_id*; KeyError if absent."""
        try:
            return int(self.table.get_item(kind='int', id=int_id)['value'])
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id,))

    def remove_int(self, int_id):
        """Delete the int under *int_id*; KeyError if absent."""
        try:
            item = self.table.get_item(kind='int', id=int_id)
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id,))
        item.delete()

    def _aggregate_set_id_element_pairs(self, setpairs):
        """Turns a list of pairs of the form (set_id, element_id) into a
        list 'L' of pairs 'p' of the form (set_id, set_of_element_ids). 'L'
        has the property that if 'p' and 'q' are distinct entries in 'L',
        then p[0] and q[0] are also distinct."""
        set_ids = set([entry[0] for entry in setpairs])
        listlist = [[entry for entry in setpairs if entry[0] == set_id]
                    for set_id in set_ids]
        result = [(pairlist[0][0], set([entry[1] for entry in pairlist]))
                  for pairlist in listlist]
        return result

    def bulk_store_vector(self, vec_ids, vectors):
        """Store vectors[i] under vec_ids[i] with one batch write."""
        if len(vec_ids) != len(vectors):
            raise ValueError
        vecpairs = zip(vec_ids, vectors)
        with self.table.batch_write() as batch:
            for vec_id, vec in vecpairs:
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_vector_old(self, vectors_df):
        """Store every row of *vectors_df* (a dataframe whose index holds
        the vector ids).

        BUG FIX: the original length guard compared the undefined names
        ``vec_ids`` and ``vectors`` and therefore always raised NameError;
        a single dataframe needs no cross-length check, so the guard is
        removed.
        """
        with self.table.batch_write() as batch:
            for ind in vectors_df.index:
                vec_id = str(ind)
                vec = vectors_df.loc[ind].values
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_int(self, int_ids, integers):
        """Store integers[i] under int_ids[i] with one batch write."""
        if len(int_ids) != len(integers):
            raise ValueError
        intpairs = zip(int_ids, integers)
        with self.table.batch_write() as batch:
            for pair in intpairs:
                int_id, integer = pair
                item = self._get_or_create_item('int', int_id)
                item['value'] = integer
                batch.put_item(item)

    def bulk_add_to_set(self, set_ids, element_ids):
        """batch_write() objects if the same item is written to more than
        once per batch, hence we aggregate all (set_id, element_id) pairs
        into a list of pairs (set_id, element_ids), where the 'set_id's are
        pairwise distinct, and the 'element_ids' are sets."""
        if len(set_ids) != len(element_ids):
            raise ValueError
        setpairs = zip(set_ids, element_ids)
        setlist = self._aggregate_set_id_element_pairs(setpairs)
        with self.table.batch_write() as batch:
            for pair in setlist:
                set_id, element_ids = pair
                item = self._get_or_create_item('set', set_id)
                if 'value' not in item.keys() or not isinstance(
                        item['value'], set):
                    item['value'] = set()
                item['value'].update(element_ids)
                batch.put_item(item)
class DDBSlurps(Dynamo):
    """Persistence helpers for the 'slurps' / 'failed_slurps' tables."""

    @classmethod
    def from_test_mode(cls, access_key=None, secret=None):
        """
        Use this for getting an instance of this class that uses test tables.
        """
        instance = cls(access_key, secret)
        instance.slurps_table = Table('test_slurps',
                                      connection=instance.connection)
        instance.failed_slurps_table = Table('test_failed_slurps',
                                             connection=instance.connection)
        return instance

    def __init__(self, access_key=None, secret=None):
        """
        ! Use test_mode factory method for instantiating this class with
        test_slurps and test_failed_slurps tables
        """
        super(DDBSlurps, self).__init__(access_key, secret)
        self.slurps_table = Table('slurps', connection=self.connection)
        self.failed_slurps_table = Table('failed_slurps',
                                         connection=self.connection)

    def save_slurp_info(self, slurp_info_, overwrite=True):
        """
        slurp_info_ can either be in the form of a list of dicts or else a
        single dict. If slurp_info is a list, batch write will be used.
        """
        if isinstance(slurp_info_, dict):
            self.slurps_table.put_item(slurp_info_, overwrite=overwrite)
        elif isinstance(slurp_info_, list):
            with self.slurps_table.batch_write() as batch:
                for s in slurp_info_:
                    batch.put_item(data=s, overwrite=overwrite)
        else:
            # modernized from the deprecated `raise TypeError, msg` form
            raise TypeError(
                "slurp_info must be a dict or a list of dicts, not a {}".format(
                    type(slurp_info_)))

    def save_failed_slurp(self, searchterm):
        """Record *searchterm* along with the current timestamp."""
        self.failed_slurps_table.put_item(data={
            'searchterm': searchterm,
            'datetime': datetime.now().isoformat()
        }, overwrite=True)

    def get_slurp_info(self, search_term_=None):
        """
        search_term_ can be either a string or a list of strings. Each string
        should be a search term you are looking for in the db.

        Returns either a single list of key-value tuples (if search_term_
        was a string) or a list of key-value tuples (if search_term_ was a
        list). Each list of key-value tuples can easily be converted to a
        dict or an OrderedDict by the client.
        """
        # searchterm_ is a STRING
        if isinstance(search_term_, basestring):
            if search_term_:
                slurp_info = (self.slurps_table.get_item(
                    searchterm=search_term_)).items()
            else:
                slurp_info = []
        # searchterm is a LIST of strings
        elif isinstance(search_term_, list):
            if search_term_:
                # create a set of non-empty searchterms; a set avoids a
                # duplicate-query error from the db
                set_of_sts = {st for st in search_term_ if st}
                list_of_st_dicts = [{'searchterm': st} for st in set_of_sts]
                res = self.slurps_table.batch_get(list_of_st_dicts)
                try:
                    slurp_info = [i.items() for i in res]
                except (StopIteration, IndexError):
                    # If res is empty, iterating raises one of these.
                    slurp_info = []
            else:
                slurp_info = []
        # searchterm is an unexpected type
        else:
            # BUG FIX: the original message claimed "a dict or a list of
            # dicts", which does not match what this method accepts.
            raise TypeError(
                "search_term_ must be a string or a list of strings, not a {}".format(
                    type(search_term_)))
        return slurp_info

    def existing_and_missing_uni(self, searchterm_list):
        """
        Takes a list of searchterm strings and returns a list of searchterm
        strings that were found in the db (in unicode) and a list of the
        searchterms that were missing from the found results.
        """
        # make sure in utf8 before we send request to the db
        input_sts_utf8 = [to_utf8_or_bust(i) for i in searchterm_list]
        found_sts_info = self.get_slurp_info(input_sts_utf8)
        found_sts_uni = [to_unicode_or_bust(dict(i)['searchterm'])
                         for i in found_sts_info]
        input_sts_uni = [to_unicode_or_bust(i) for i in input_sts_utf8]
        missing_sts_uni = order_conserving.setdiff(input_sts_uni,
                                                   found_sts_uni)
        return found_sts_uni, missing_sts_uni

    def get_table(self, table_name):
        """
        Convenience method for client who may wish to get a specific table
        from the DynamoDB connection.
        """
        return Table(table_name, connection=self.connection)

    def truncate_failed_slurp_table(self):
        """Delete every item in the failed_slurps table."""
        with self.failed_slurps_table.batch_write() as batch:
            for item in self.failed_slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def truncate_slurp_table(self):
        """ WARNING! Only use for test mode table """
        assert self.slurps_table.table_name == 'test_slurps', "Will only truncate test slurps table. To truncate production table, run code manually"
        # CLEANUP: the original built a second handle to 'test_slurps' but
        # scanned self.slurps_table anyway; the assert guarantees both are
        # the same table, so use the bound handle for scan and delete.
        with self.slurps_table.batch_write() as batch:
            for item in self.slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def modify_failed_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write,
                                      self.failed_slurps_table)

    def modify_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write,
                                      self.slurps_table)

    def get_slurps_table_info(self):
        return self.get_table_info(self.slurps_table)

    def get_failed_slurps_table_info(self):
        return self.get_table_info(self.failed_slurps_table)
###########################
## Batch Reading
###########################
# Like batch writing, batch reading collapses many item fetches into far
# fewer API requests.
from boto.dynamodb2.table import Table

users = Table('users2', connection=conn)

# One request addressing three items, each by its full (hash, range) key.
many_users = users.batch_get(keys=[
    {'account_type': 'standard_user', 'last_name': 'Doe'},
    {'account_type': 'standard_user', 'last_name': 'Doering'},
    {'account_type': 'super_user', 'last_name': 'Liddel'},
])
for user in many_users:
    print(user['first_name'])

###########################
## Table Scan
###########################
from boto.dynamodb2.table import Table

users = Table('users2', connection=conn)

# Scan the table with a birthday filter and a limit of 50.
scan_results = users.scan(birthday__ne='null', limit=50)
def __init__(self, term, database, score):
    """Scan the IndexTdIdf table for articles matching *term* in *database*
    with tf-idf above *score*, batch-fetch those articles, and accumulate
    per-day counts plus lexical-diversity / subjectivity / polarity values.

    term     -- word to look up in the IndexTdIdf table
    database -- source table name (e.g. "CNN"); selects per-source formats
    score    -- minimum tdIdfRoundTo7 value for an article to be included
    """
    # accumulators for the per-article stats of interest
    self.lexdiv = []
    self.subjectivity = []
    self.polarity = []
    # seed max with a date 5 years back and min with "now" so any real
    # article date replaces them; both use the "YYYY-MM-DD HH:mm" format
    self.origmaxdate = arrow.now().replace(years=-5)
    self.maxdate = self.origmaxdate.format("YYYY-MM-DD HH:mm")
    self.origmindate = arrow.now()
    self.mindate = self.origmindate.format("YYYY-MM-DD HH:mm")
    self.articleIDs = []
    self.term = term
    self.source = database
    self.score = score
    self.articleIDsub = []
    self.IDtracker = []
    self.dayscount = {}
    # each source table stores its timestamps in a different format; pick
    # the parse format for this source
    self.dtstringdict = {
        "DailyBeast": "MM-DD-YY h:mm",
        "CNN": "MMM-D-YYYY h:mm",
        "Huffington_Post": "YYYY-MM-DD HH:mm:ss",
        "Washington_Post": "YYYY-MM-DD HH:mm:ss",
        "RollingStone": "YYYY-MM-DD HH:mm:ssZZ",
        "SF_Gate": "YYYY-MM-DD HH:mm:ss",
    }
    self.dtstring = self.dtstringdict[self.source]
    # per-source hash-key attribute name
    self.indexdict = {
        "DailyBeast": "tstamp",
        "CNN": "tstamp",
        "Huffington_Post": "id",
        "Washington_Post": "id",
        "RollingStone": "id",
        "SF_Gate": "id",
    }
    self.index = self.indexdict[self.source]
    # per-source name of the column holding the article date
    self.dtdict = {
        "DailyBeast": "datetime",
        "CNN": "datetime",
        "Huffington_Post": "date",
        "Washington_Post": "date",
        "RollingStone": "date",
        "SF_Gate": "date",
    }
    self.dtstr = self.dtdict[self.source]
    # these sources were created on Dynamo without a RangeKey, so their
    # batch_get keys carry only the hash key
    self.norangesources = ["Huffington_Post", "Washington_Post",
                           "RollingStone", "SF_Gate"]

    # connect to dynamodb
    conn = boto.dynamodb2.connect_to_region("us-west-2",
                                            aws_access_key_id="",
                                            aws_secret_access_key="")
    table = Table(self.source, connection=conn)
    querytable = Table("IndexTdIdf", connection=conn)

    # Collect matching article ids, grouped into lists of 100 for the
    # batch_get requests below.
    for row in querytable.scan():
        article_id = int(float(row["articleId"]))
        if (row["word"] == self.term
                and row["source"] == self.source
                and row["tdIdfRoundTo7"] > self.score
                and article_id not in self.IDtracker):
            # sources without a RangeKey use a hash-key-only key dict
            if self.source in self.norangesources:
                dictvalue = {self.index: article_id}
            else:
                dictvalue = {self.index: article_id, "source": self.source}
            self.articleIDsub.append(dictvalue)
            self.IDtracker.append(article_id)
            if len(self.articleIDsub) == 100:
                self.articleIDs.append(self.articleIDsub)
                self.articleIDsub = []
    # append any leftover ids that did not fill a group of 100
    self.articleIDs.append(self.articleIDsub)

    for entry in self.articleIDs:
        # ROBUSTNESS: the final group may be empty; skip the request
        if not entry:
            continue
        results = table.batch_get(keys=entry)
        for result in results:
            # BUG FIX: the original `x is not None or ""` always reduced to
            # `x is not None`; truthiness also skips empty date strings.
            if result[self.dtstr]:
                try:
                    parsed = arrow.get(result[self.dtstr], self.dtstring)
                except Exception:
                    # unparseable date: skip this article's date stats
                    continue
                tm = parsed.format("YYYY-MM-DD")
                self.dayscount[tm] = self.dayscount.get(tm, 0) + 1
                # BUG FIX: compare and store max/min in the single
                # "YYYY-MM-DD HH:mm" format the seeds use; the original
                # compared a "YYYY-MM-DD" string but then stored the raw
                # source-format string, corrupting later comparisons.
                full = parsed.format("YYYY-MM-DD HH:mm")
                if full > self.maxdate:
                    self.maxdate = full
                if full < self.mindate:
                    self.mindate = full
            # BUG FIX: `x is not None or 0` also reduced to `x is not
            # None`; truthiness skips both missing (None) and zero
            # readings, matching the apparent intent of the dead `or 0`.
            if result["lexical diversity"]:
                self.lexdiv.append(result["lexical diversity"])
            if result["subjectivity"]:
                self.subjectivity.append(result["subjectivity"])
            if result["polarity"]:
                self.polarity.append(result["polarity"])
def __init__(self, term, database, score):
    """Scan the IndexTdIdf table for articles matching *term* in *database*
    with tf-idf above *score*, batch-fetch those articles, and accumulate
    per-day counts plus lexical-diversity / subjectivity / polarity values.

    term     -- word to look up in the IndexTdIdf table
    database -- source table name (e.g. 'CNN'); selects per-source formats
    score    -- minimum tdIdfRoundTo7 value for an article to be included
    """
    # accumulators for the per-article stats of interest
    self.lexdiv = []
    self.subjectivity = []
    self.polarity = []
    # seed max with a date 5 years back and min with "now" so any real
    # article date replaces them; both use the 'YYYY-MM-DD HH:mm' format
    self.origmaxdate = arrow.now().replace(years=-5)
    self.maxdate = self.origmaxdate.format('YYYY-MM-DD HH:mm')
    self.origmindate = arrow.now()
    self.mindate = self.origmindate.format('YYYY-MM-DD HH:mm')
    self.articleIDs = []
    self.term = term
    self.source = database
    self.score = score
    self.articleIDsub = []
    self.IDtracker = []
    self.dayscount = {}
    # each source table stores its timestamps in a different format; pick
    # the parse format for this source
    self.dtstringdict = {
        'DailyBeast': "MM-DD-YY h:mm",
        'CNN': "MMM-D-YYYY h:mm",
        "Huffington_Post": "YYYY-MM-DD HH:mm:ss",
        "Washington_Post": "YYYY-MM-DD HH:mm:ss",
        "RollingStone": "YYYY-MM-DD HH:mm:ssZZ",
        "SF_Gate": "YYYY-MM-DD HH:mm:ss"
    }
    self.dtstring = self.dtstringdict[self.source]
    # per-source hash-key attribute name
    self.indexdict = {
        'DailyBeast': "tstamp",
        'CNN': "tstamp",
        "Huffington_Post": "id",
        "Washington_Post": "id",
        "RollingStone": "id",
        "SF_Gate": "id"
    }
    self.index = self.indexdict[self.source]
    # per-source name of the column holding the article date
    self.dtdict = {
        'DailyBeast': "datetime",
        'CNN': "datetime",
        "Huffington_Post": "date",
        "Washington_Post": "date",
        "RollingStone": "date",
        "SF_Gate": "date"
    }
    self.dtstr = self.dtdict[self.source]
    # these sources were created on Dynamo without a RangeKey, so their
    # batch_get keys carry only the hash key
    self.norangesources = [
        "Huffington_Post", "Washington_Post", "RollingStone", "SF_Gate"
    ]

    # connect to dynamodb
    conn = boto.dynamodb2.connect_to_region('us-west-2',
                                            aws_access_key_id='',
                                            aws_secret_access_key='')
    table = Table(self.source, connection=conn)
    querytable = Table('IndexTdIdf', connection=conn)

    # Collect matching article ids, grouped into lists of 100 for the
    # batch_get requests below.
    for row in querytable.scan():
        article_id = int(float(row["articleId"]))
        if (row["word"] == self.term
                and row["source"] == self.source
                and row["tdIdfRoundTo7"] > self.score
                and article_id not in self.IDtracker):
            # sources without a RangeKey use a hash-key-only key dict
            if self.source in self.norangesources:
                dictvalue = {self.index: article_id}
            else:
                dictvalue = {self.index: article_id, 'source': self.source}
            self.articleIDsub.append(dictvalue)
            self.IDtracker.append(article_id)
            if len(self.articleIDsub) == 100:
                self.articleIDs.append(self.articleIDsub)
                self.articleIDsub = []
    # append any leftover ids that did not fill a group of 100
    self.articleIDs.append(self.articleIDsub)

    for entry in self.articleIDs:
        # ROBUSTNESS: the final group may be empty; skip the request
        if not entry:
            continue
        results = table.batch_get(keys=entry)
        for result in results:
            # BUG FIX: the original `x is not None or ''` always reduced to
            # `x is not None`; truthiness also skips empty date strings.
            if result[self.dtstr]:
                try:
                    parsed = arrow.get(result[self.dtstr], self.dtstring)
                except Exception:
                    # unparseable date: skip this article's date stats
                    continue
                tm = parsed.format('YYYY-MM-DD')
                self.dayscount[tm] = self.dayscount.get(tm, 0) + 1
                # BUG FIX: compare and store max/min in the single
                # 'YYYY-MM-DD HH:mm' format the seeds use; the original
                # compared a 'YYYY-MM-DD' string but then stored the raw
                # source-format string, corrupting later comparisons.
                full = parsed.format('YYYY-MM-DD HH:mm')
                if full > self.maxdate:
                    self.maxdate = full
                if full < self.mindate:
                    self.mindate = full
            # BUG FIX: `x is not None or 0` also reduced to `x is not
            # None`; truthiness skips both missing (None) and zero
            # readings, matching the apparent intent of the dead `or 0`.
            if result['lexical diversity']:
                self.lexdiv.append(result['lexical diversity'])
            if result["subjectivity"]:
                self.subjectivity.append(result["subjectivity"])
            if result["polarity"]:
                self.polarity.append(result["polarity"])
class DynamoDBAdapter(key_value_store.KeyValueStore):
    """
    Implementation of an abstract key-value store defined in
    key_value_store.py. The underlying database is amazon DynamoDB.

    The store keeps all objects in a single table with following schema:
    [HashKey('kind', data_type=STRING), RangeKey('id')]. 'kind' is the
    string with the object type ('vector', 'set' or 'int') and 'id' is the
    object id. The object value is stored in the 'value' attribute of the
    table items. The table should be created before this code is executed.

    Amazon configuration is assumed to be stored in ~/.boto file as
    described in http://boto.readthedocs.org/en/latest/boto_config_tut.html
    """

    def __init__(self, precision=np.dtype('float32'), table_name='test'):
        """
        Create an instance of the dynamodb key-value store.

        precision - a numpy type, elements of all vectors are converted and
        stored in this type; table_name - the name of the DynamoDB table
        which keeps the objects.
        """
        conn = boto.dynamodb2.connect_to_region('eu-west-1')
        if not isinstance(precision, np.dtype):
            raise TypeError("Precision should be a numpy.dtype subtype")
        self.precision = precision
        self.precision_name = precision.name
        self.table = Table(table_name, connection=conn)

    def _get_or_create_item(self, kind, item_id):
        # Fetch the (kind, id) item, or build a fresh unsaved Item shell.
        try:
            item = self.table.get_item(kind=kind, id=item_id)
        except ItemNotFound:
            item = Item(self.table)
            item['kind'] = kind
            item['id'] = item_id
        return item

    def _create_vector_item(self, vec_id, vector):
        # Serialize the vector in the configured precision and remember the
        # precision name so the bytes can be decoded later.
        item = self._get_or_create_item('vector', vec_id)
        item['value'] = Binary(vector.astype(self.precision).tostring())
        item['precision'] = self.precision_name
        return item

    def _vector_value(self, item):
        # Inverse of _create_vector_item: decode bytes back into an array.
        return np.fromstring(str(item['value']), np.dtype(item['precision']))

    def get_vector_ids(self):
        """Return the ids of all stored vectors."""
        return [v['id'] for v in self.table.query_2(kind__eq='vector')]

    def get_int_ids(self):
        """Return the ids of all stored ints."""
        return [v['id'] for v in self.table.query_2(kind__eq='int')]

    def get_set_ids(self):
        """Return the ids of all stored sets."""
        return [v['id'] for v in self.table.query_2(kind__eq='set')]

    def store_vector(self, vec_id, vector):
        """Store a single vector under *vec_id*."""
        item = self._create_vector_item(vec_id, vector)
        item.save()

    def get_vector(self, vec_id):
        """Return the vector stored under *vec_id*; KeyError if absent."""
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id, ))
        return self._vector_value(item)

    def bulk_get_vector(self, vec_ids):
        """Batch-fetch many vectors; order follows the batch_get results."""
        keys = [{'kind': 'vector', 'id': i} for i in vec_ids]
        vs = self.table.batch_get(keys=keys)
        return [self._vector_value(i) for i in vs]

    def remove_vector(self, vec_id):
        """Delete the vector under *vec_id*; KeyError if absent."""
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id, ))
        item.delete()

    def add_to_set(self, set_id, element_id):
        """Add *element_id* to the set *set_id*, creating it if needed."""
        item = self._get_or_create_item('set', set_id)
        # reset 'value' if missing or of an unexpected type
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            item['value'] = set()
        item['value'].add(element_id)
        item.save(overwrite=True)

    def remove_from_set(self, set_id, element_id):
        """Remove *element_id* from set *set_id*; KeyError on any miss."""
        try:
            item = self.table.get_item(kind='set', id=set_id)
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id, ))
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            raise KeyError('Incorrect value in item %s' % (set_id, ))
        if element_id not in item['value']:
            raise KeyError('Element %s not in set %s' % (element_id, set_id))
        item['value'].remove(element_id)
        item.save()

    def remove_set(self, set_id):
        """Delete the whole set *set_id*; KeyError if absent."""
        try:
            item = self.table.get_item(kind='set', id=set_id)
            item.delete()
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id, ))

    def get_set(self, set_id):
        """Return set *set_id* with entries coerced to str."""
        try:
            the_set = self.table.get_item(kind='set', id=set_id)['value']
            return set([str(entry) for entry in the_set])
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id, ))

    def store_int(self, int_id, integer):
        """Store a single integer under *int_id*."""
        item = self._get_or_create_item('int', int_id)
        item['value'] = integer
        item.save()

    def get_int(self, int_id):
        """Return the int stored under *int_id*; KeyError if absent."""
        try:
            return int(self.table.get_item(kind='int', id=int_id)['value'])
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id, ))

    def remove_int(self, int_id):
        """Delete the int under *int_id*; KeyError if absent."""
        try:
            item = self.table.get_item(kind='int', id=int_id)
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id, ))
        item.delete()

    def _aggregate_set_id_element_pairs(self, setpairs):
        """Turns a list of pairs of the form (set_id, element_id) into a
        list 'L' of pairs 'p' of the form (set_id, set_of_element_ids). 'L'
        has the property that if 'p' and 'q' are distinct entries in 'L',
        then p[0] and q[0] are also distinct."""
        set_ids = set([entry[0] for entry in setpairs])
        listlist = [[entry for entry in setpairs if entry[0] == set_id]
                    for set_id in set_ids]
        result = [(pairlist[0][0], set([entry[1] for entry in pairlist]))
                  for pairlist in listlist]
        return result

    def bulk_store_vector(self, vec_ids, vectors):
        """Store vectors[i] under vec_ids[i] with one batch write."""
        if len(vec_ids) != len(vectors):
            raise ValueError
        vecpairs = zip(vec_ids, vectors)
        with self.table.batch_write() as batch:
            for vec_id, vec in vecpairs:
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_vector_old(self, vectors_df):
        """Store every row of *vectors_df* (a dataframe whose index holds
        the vector ids).

        BUG FIX: the original length guard compared the undefined names
        ``vec_ids`` and ``vectors`` and therefore always raised NameError;
        a single dataframe needs no cross-length check, so the guard is
        removed.
        """
        with self.table.batch_write() as batch:
            for ind in vectors_df.index:
                vec_id = str(ind)
                vec = vectors_df.loc[ind].values
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_int(self, int_ids, integers):
        """Store integers[i] under int_ids[i] with one batch write."""
        if len(int_ids) != len(integers):
            raise ValueError
        intpairs = zip(int_ids, integers)
        with self.table.batch_write() as batch:
            for pair in intpairs:
                int_id, integer = pair
                item = self._get_or_create_item('int', int_id)
                item['value'] = integer
                batch.put_item(item)

    def bulk_add_to_set(self, set_ids, element_ids):
        """batch_write() objects if the same item is written to more than
        once per batch, hence we aggregate all (set_id, element_id) pairs
        into a list of pairs (set_id, element_ids), where the 'set_id's are
        pairwise distinct, and the 'element_ids' are sets."""
        if len(set_ids) != len(element_ids):
            raise ValueError
        setpairs = zip(set_ids, element_ids)
        setlist = self._aggregate_set_id_element_pairs(setpairs)
        with self.table.batch_write() as batch:
            for pair in setlist:
                set_id, element_ids = pair
                item = self._get_or_create_item('set', set_id)
                if 'value' not in item.keys() or not isinstance(
                        item['value'], set):
                    item['value'] = set()
                item['value'].update(element_ids)
                batch.put_item(item)
class DDBSlurps(Dynamo):
    """Persistence helpers for the 'slurps' / 'failed_slurps' tables."""

    @classmethod
    def from_test_mode(cls, access_key=None, secret=None):
        """
        Use this for getting an instance of this class that uses test tables.
        """
        instance = cls(access_key, secret)
        instance.slurps_table = Table('test_slurps',
                                      connection=instance.connection)
        instance.failed_slurps_table = Table('test_failed_slurps',
                                             connection=instance.connection)
        return instance

    def __init__(self, access_key=None, secret=None):
        """
        ! Use test_mode factory method for instantiating this class with
        test_slurps and test_failed_slurps tables
        """
        super(DDBSlurps, self).__init__(access_key, secret)
        self.slurps_table = Table('slurps', connection=self.connection)
        self.failed_slurps_table = Table('failed_slurps',
                                         connection=self.connection)

    def save_slurp_info(self, slurp_info_, overwrite=True):
        """
        slurp_info_ can either be in the form of a list of dicts or else a
        single dict. If slurp_info is a list, batch write will be used.
        """
        if isinstance(slurp_info_, dict):
            self.slurps_table.put_item(slurp_info_, overwrite=overwrite)
        elif isinstance(slurp_info_, list):
            with self.slurps_table.batch_write() as batch:
                for s in slurp_info_:
                    batch.put_item(data=s, overwrite=overwrite)
        else:
            # modernized from the deprecated `raise TypeError, msg` form
            raise TypeError(
                "slurp_info must be a dict or a list of dicts, not a {}".format(
                    type(slurp_info_)))

    def save_failed_slurp(self, searchterm):
        """Record *searchterm* along with the current timestamp."""
        self.failed_slurps_table.put_item(data={
            'searchterm': searchterm,
            'datetime': datetime.now().isoformat()
        }, overwrite=True)

    def get_slurp_info(self, search_term_=None):
        """
        search_term_ can be either a string or a list of strings. Each string
        should be a search term you are looking for in the db.

        Returns either a single list of key-value tuples (if search_term_
        was a string) or a list of key-value tuples (if search_term_ was a
        list). Each list of key-value tuples can easily be converted to a
        dict or an OrderedDict by the client.
        """
        # searchterm_ is a STRING
        if isinstance(search_term_, basestring):
            if search_term_:
                slurp_info = (self.slurps_table.get_item(
                    searchterm=search_term_)).items()
            else:
                slurp_info = []
        # searchterm is a LIST of strings
        elif isinstance(search_term_, list):
            if search_term_:
                # create a set of non-empty searchterms; a set avoids a
                # duplicate-query error from the db
                set_of_sts = {st for st in search_term_ if st}
                list_of_st_dicts = [{'searchterm': st} for st in set_of_sts]
                res = self.slurps_table.batch_get(list_of_st_dicts)
                try:
                    slurp_info = [i.items() for i in res]
                except (StopIteration, IndexError):
                    # If res is empty, iterating raises one of these.
                    slurp_info = []
            else:
                slurp_info = []
        # searchterm is an unexpected type
        else:
            # BUG FIX: the original message claimed "a dict or a list of
            # dicts", which does not match what this method accepts.
            raise TypeError(
                "search_term_ must be a string or a list of strings, not a {}".format(
                    type(search_term_)))
        return slurp_info

    def existing_and_missing_uni(self, searchterm_list):
        """
        Takes a list of searchterm strings and returns a list of searchterm
        strings that were found in the db (in unicode) and a list of the
        searchterms that were missing from the found results.
        """
        # make sure in utf8 before we send request to the db
        input_sts_utf8 = [to_utf8_or_bust(i) for i in searchterm_list]
        found_sts_info = self.get_slurp_info(input_sts_utf8)
        found_sts_uni = [to_unicode_or_bust(dict(i)['searchterm'])
                         for i in found_sts_info]
        input_sts_uni = [to_unicode_or_bust(i) for i in input_sts_utf8]
        missing_sts_uni = order_conserving.setdiff(input_sts_uni,
                                                   found_sts_uni)
        return found_sts_uni, missing_sts_uni

    def get_table(self, table_name):
        """
        Convenience method for client who may wish to get a specific table
        from the DynamoDB connection.
        """
        return Table(table_name, connection=self.connection)

    def truncate_failed_slurp_table(self):
        """Delete every item in the failed_slurps table."""
        with self.failed_slurps_table.batch_write() as batch:
            for item in self.failed_slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def truncate_slurp_table(self):
        """ WARNING! Only use for test mode table """
        assert self.slurps_table.table_name == 'test_slurps', "Will only truncate test slurps table. To truncate production table, run code manually"
        # CLEANUP: the original built a second handle to 'test_slurps' but
        # scanned self.slurps_table anyway; the assert guarantees both are
        # the same table, so use the bound handle for scan and delete.
        with self.slurps_table.batch_write() as batch:
            for item in self.slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def modify_failed_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write,
                                      self.failed_slurps_table)

    def modify_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write,
                                      self.slurps_table)

    def get_slurps_table_info(self):
        return self.get_table_info(self.slurps_table)

    def get_failed_slurps_table_info(self):
        return self.get_table_info(self.failed_slurps_table)