def compute_sam_match_query(channel):
    """ Compute the query which would match all speech acts for the given
    channels list, in the timeslot interval (from - to)"""
    from solariat.db import fields
    from solariat_bottle.utils.id_encoder import BIGGEST_POST_VALUE, TIMESLOT_WIDTH
    from solariat_bottle.db.speech_act import SpeechActMap, pack_speech_act_map_id

    from_timeslot = 0 << TIMESLOT_WIDTH
    to_timeslot = (1L << TIMESLOT_WIDTH) - 1
    to_binary = fields.BytesField().to_mongo

    match_query_base = []
    for status in SpeechActMap.STATUS_NAME_MAP.keys():
        # compute id bounds for all posts for this slot
        id_lower_bound = pack_speech_act_map_id(channel, status, from_timeslot, 0)
        id_upper_bound = pack_speech_act_map_id(channel, status, to_timeslot,
                                                BIGGEST_POST_VALUE)
        match_query_base.append({
            '_id': {"$gte": to_binary(id_lower_bound),
                    "$lte": to_binary(id_upper_bound)}
        })
    day_speech_act_filter = {"$or": match_query_base}
    return day_speech_act_filter
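# Hedged usage sketch (illustrative, not part of the original module). It assumes
# `channel` is a Channel document with a numeric counter, as pack_speech_act_map_id
# expects, and that SpeechActMap.objects.coll is the underlying pymongo collection;
# the helper name below is hypothetical.
def _example_fetch_speech_acts(channel):
    from solariat_bottle.db.speech_act import SpeechActMap
    sam_filter = compute_sam_match_query(channel)
    # The $or of per-status _id ranges lets Mongo walk the _id index for each
    # (channel, status) prefix instead of scanning the whole collection.
    return list(SpeechActMap.objects.coll.find(sam_filter))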
def make_id(cls, channel, status, time_slot):
    channel_num = channel.counter
    assert isinstance(channel_num, (int, long)), \
        'channel.counter must be an integer: %r' % channel_num  # noqa
    to_binary = fields.BytesField().to_mongo
    return to_binary(pack_short_stats_id(channel_num, status, time_slot))
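# Hedged usage sketch: the enclosing class is not shown in this excerpt, so
# `ShortStats` below is a stand-in name for whichever stats Document this
# classmethod lives on.
def _example_lookup_short_stats(ShortStats, channel, status, time_slot):
    # make_id packs (channel.counter, status, time_slot) into the binary _id
    # used by the stats collection, so a lookup is a single indexed get.
    doc_id = ShortStats.make_id(channel, status, time_slot)
    return ShortStats.objects.find_one(id=doc_id)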
class StreamLog(Document):
    """Created on streamref creation, updated on stream stops"""
    accounts = fields.ListField(fields.ObjectIdField())
    channels = fields.ListField(fields.ObjectIdField())
    stream_ref_id = fields.BytesField()
    started_at = fields.DateTimeField(null=True)
    stopped_at = fields.DateTimeField(null=True)

    indexes = [('accounts', ), ('channels', ), ('stream_ref_id', )]
class StreamRef(Document):
    QUEUED = 'queued'
    RUNNING = 'running'
    ERROR = 'error'
    STOPPED = 'stopped'
    STREAM_STATUSES = [QUEUED, RUNNING, ERROR, STOPPED]

    id = fields.BytesField(db_field='_id', unique=True, required=True)
    track = fields.ListField(fields.StringField())
    follow = fields.ListField(fields.StringField())  # user_ids
    languages = fields.ListField(fields.StringField(), db_field='lng')
    status = fields.StringField(choices=STREAM_STATUSES)
    log = fields.ReferenceField('StreamLog')

    manager = StreamRefManager
    indexes = [('status', )]

    def is_stopped(self):
        return self.status == self.STOPPED or (
            self.log and self.log.stopped_at is not None)

    @property
    def key(self):
        if not self.id:
            footprint = self.filters
            self.id = mhash(footprint, n=128)
        return self.id

    @property
    def filters(self):
        return tuple(
            [freeze(self.track), freeze(self.follow), freeze(self.languages)])

    def set_added(self):
        self.update(status=self.RUNNING)
        self.log.update(started_at=now())

    def set_removed(self):
        self.update(status=self.STOPPED)
        self.log.update(stopped_at=now())

    def save(self, **kw):
        self.id = self.key  # fill hash id
        super(StreamRef, self).save(**kw)
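# Hedged lifecycle sketch (illustrative, not part of the original module): a
# StreamRef gets a deterministic binary id hashed from its (track, follow,
# languages) filters on save, while its StreamLog records start/stop times.
# The helper name and sample filter values are assumptions.
def _example_stream_lifecycle(account_id, channel_id):
    log = StreamLog(accounts=[account_id], channels=[channel_id])
    log.save()
    ref = StreamRef(track=['solariat'], follow=[], languages=['en'],
                    status=StreamRef.QUEUED, log=log)
    ref.save()                       # save() fills ref.id from the filters hash
    log.update(stream_ref_id=ref.id)
    ref.set_added()                  # stream picked up: RUNNING, log.started_at set
    ref.set_removed()                # stream torn down: STOPPED, log.stopped_at set
    return ref.is_stopped()          # True once stopped_at is recorded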
from solariat.db import fields
from solariat_bottle.settings import LOGGER
from solariat_bottle.db.channel_trends import ChannelTrendsManager
from solariat_bottle.db.channel_stats_base import ChannelTrendsBase, EmbeddedStats
from solariat_bottle.utils.id_encoder import pack_conversation_stats_id, unpack_conversation_stats_id
from solariat_bottle.db.channel_stats_base import ALL_AGENTS, CountDict

to_binary = fields.BytesField().to_mongo


class ConversationEmbeddedStats(EmbeddedStats):
    # Metrics
    count = fields.NumField(db_field='cn', default=0)

    countable_keys = ['count']

    # def __hash__(self):
    #     return hash(self.agent)

    def __str__(self):
        return "|agent=%s;count=%s|" % (self.agent, self.count)


class ConversationTrends(ChannelTrendsBase):
    """ Base class for all conversation trends.
    Has allow_inheritance set to True. """
    manager = ChannelTrendsManager
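# Hedged sketch (illustrative only): each ConversationTrends document carries a
# list of per-agent embedded stats, with ALL_AGENTS (= 0) acting as the roll-up
# bucket. The helper name is hypothetical.
def _example_conversation_counts(trend_doc):
    # str(es) renders as "|agent=..;count=..|" via ConversationEmbeddedStats.__str__
    return dict((es.agent, es.count) for es in trend_doc.embedded_stats)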
class ChannelStatsBase(Document):
    """ Base class for trend and topic stats. """
    version = fields.NumField(db_field='_v')
    id = fields.BytesField(db_field='_id', unique=True, required=True)
    time_slot = fields.NumField(default=0, db_field='ts')
    gc_counter = fields.NumField(db_field='g')
    channel_ts = fields.BytesField(db_field='ct')

    indexes = [('gc_counter', )]

    def channel_ts_from_id(self, data_id):
        """ From a document id compute a channel ts """
        channel_num, _, _, time_slot = unpack_stats_id(to_python(data_id))
        return self.make_channel_ts(channel=channel_num, time_slot=time_slot)

    @classmethod
    def make_channel_ts(cls, channel, time_slot):
        channel_num = get_channel_num(channel)
        res = pack_components(
            (channel_num, CHANNEL_WIDTH),
            (time_slot, TIMESLOT_WIDTH),
        )
        return res

    @property
    def EmbeddedStatsCls(self):
        return self.fields['embedded_stats'].field.doc_class

    @property
    def _query(self):
        raise AppException(
            'unimplemented method, to be overridden in a subclass')

    def prepare_update_query(self, item_id, item_topic):
        """ Generate the update query for all the embedded stats.
        Also return the item version so that we can do optimistic locking on
        stats updates. This is needed because we are setting new embedded
        stats every time.
        """
        item = self.objects.find_one(id=item_id)
        if item:
            # In case of hash collision, this should make sure we are not
            # retrying again and again.
            # item_topic can be None for simple trends
            if item_topic is not None:
                assert item.topic == item_topic, u"Collision '%s' '%s'" % (
                    item.topic, item_topic)
            version = item.version if item.version else DEFAULT_NEW_VERSION
            existing_embedded_stats = item.embedded_stats
        else:
            version = DEFAULT_NEW_VERSION
            existing_embedded_stats = []

        new_embedded_stats = self.compute_new_embeded_stats()
        # Generate an updated list based on the in-memory entries and
        # the existing entries from the database.
        updated_list = self.EmbeddedStatsCls.update_list(
            existing_embedded_stats, new_embedded_stats)
        self._upsert_data = {
            "$set": {self.name2db_field('embedded_stats'): updated_list}
        }
        return version

    def upsert(self, w=1):
        # Try 5 times just to make it safe for conflicts.
        if self.stats_upsert(max_tries=5):
            return True
        return False

    def get_expected_topic(self, query):
        """ In the simple reports we expect no topic. """
        return None

    def stats_upsert(self, max_tries=4, logger=LOGGER):
        """Used in upsert() method for documents with embedded stats list.

        Returns True if the document has been successfully saved within
        `max_tries` iterations, else False.
        """
        _v = self.name2db_field('version')
        find_query = self._query
        # remove 'gc_counter' if exists
        find_query.pop(self.name2db_field('gc_counter'), None)
        # simple trends do not have topics
        item_topic = self.get_expected_topic(find_query)
        item_id = find_query["_id"]

        nr_of_tries = 0
        while nr_of_tries < max_tries:
            nr_of_tries += 1
            try:
                version = self.prepare_update_query(item_id, item_topic)
            except AssertionError, e:
                logger.warning(
                    u"Topic hashing collision. Stats not updated! "
                    u"\nfind query=%s\nitem topic=%s\n%s" % (
                        find_query, item_topic, e))
                return False

            # Increment version using $set to be more robust to new documents
            self._upsert_data["$set"][_v] = version + 1
            if version > DEFAULT_NEW_VERSION:
                # If it's an update, just look by id and version, nothing else
                # really matters
                find_query = {_v: version, "_id": find_query["_id"]}
            else:
                # On new documents, set the default version and use the whole
                # find query so the upsert generates a document with whole data
                find_query[_v] = version

            try:
                assert '_id' in find_query, 'unique id required'
                assert '_v' in find_query, 'version required'
                self.objects.coll.update(find_query, self._upsert_data,
                                         upsert=True, w=1)
                return True
            except AssertionError, e:
                logger.error(
                    u"Find query needs at very least _id and _v. "
                    u"Instead got: %s" % str(find_query))
                return False
            except DuplicateKeyError, e:
                # This is just part of our optimistic lock and can fail a lot,
                # especially for high traffic channels, so we should not
                # consider it an error since it just makes actual error
                # tracking in logs harder to do.
                if 2 <= nr_of_tries <= 3:
                    # We already tried 2 or 3 times, it might be an actual problem
                    LOGGER.warning(
                        "channel stats locking: collision %s times in a row. id=%r",
                        nr_of_tries, find_query['_id'])
                elif nr_of_tries >= 4:
                    # We already tried 4 times, something is definitely wrong
                    LOGGER.error(
                        u"channel stats locking: collision repeated %s times. "
                        u"Find query=%s, Upsert=%s" % (
                            nr_of_tries, find_query, self._upsert_data))
                # If we just had an optimistic lock fail, sleep for a random
                # period until trying again.
                delay_sec = max(0.01, normalvariate(0.1, 0.03))
                LOGGER.debug(
                    'channel stats locking: waiting for %.2f sec after a collision',
                    delay_sec)
                sleep(delay_sec)
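# Hedged usage sketch (illustrative only; `trend` stands for any concrete
# ChannelStatsBase subclass instance whose _query property returns a find
# query keyed on the packed binary _id): upsert() wraps stats_upsert(), which
# retries the versioned update a few times when the optimistic lock trips.
def _example_store_trend(trend):
    # stats_upsert() bumps the '_v' field on every successful write; a
    # DuplicateKeyError from a concurrent writer triggers a short random
    # back-off and another attempt.
    if not trend.upsert():
        LOGGER.warning("giving up on stats update for %r", trend.id)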
from solariat.db import fields
from solariat.db.abstract import SonDocument, Document
from solariat.utils.lang.support import Lang, get_lang_code

from solariat_nlp.sa_labels import ALL_INTENTIONS, SATYPE_ID_TO_NAME_MAP

from solariat_bottle.settings import LOGGER, AppException, get_var
from solariat_bottle.db.channel.base import Channel
from solariat_bottle.utils.id_encoder import (pack_stats_id,
                                              unpack_stats_id,
                                              pack_components,
                                              get_channel_num,
                                              CHANNEL_WIDTH,
                                              TIMESLOT_WIDTH)

ALL_AGENTS = 0
ANONYMOUS_AGENT_ID = -1
DEFAULT_NEW_VERSION = 0
ALL_INTENTIONS_INT = int(ALL_INTENTIONS.oid)

to_python = fields.BytesField().to_python
to_mongo = fields.BytesField().to_mongo
to_binary = to_mongo


def conversation_closed(conversation, closing_time, quality):
    from solariat_bottle.tasks.stats import update_conversation_stats

    if get_var('ON_TEST'):
        update_conversation_stats(conversation, closing_time, quality)
    else:
        update_conversation_stats.ignore(conversation, closing_time, quality)


def post_created(post, **context):
    # Avoid circular imports. TODO: We should have a clearer package dep chain.
    from solariat_bottle.utils.stats import _update_channel_stats
class DataExport(ArchivingAuthDocument):
    STATES = dict(CREATED=0,
                  FETCHING=1,
                  FETCHED=2,
                  GENERATING=3,
                  GENERATED=4,
                  SENDING=5,
                  SENT=6,
                  SUCCESS=7,
                  ERROR=8,
                  CANCELLED=9)
    State = enum(**STATES)

    account = fields.ReferenceField('Account', db_field='at')
    created_by = fields.ReferenceField('User', db_field='ur')
    recipients = fields.ListField(fields.ReferenceField('User'), db_field='rs')
    recipient_emails = fields.ListField(fields.StringField(), db_field='rse')
    state = fields.NumField(choices=STATES.values(),
                            default=State.CREATED,
                            db_field='se')
    created_at = fields.DateTimeField(db_field='ct', default=now)
    _input_filter = fields.DictField(db_field='ir')
    input_filter_hash = fields.BytesField(db_field='irh')
    states_log = fields.ListField(fields.DictField(), db_field='sg')

    indexes = [('acl', 'input_filter_hash')]
    manager = DataExportManager

    def set_input_filter(self, data):
        self._input_filter = data
        self.input_filter_hash = hash_dict(data)

    input_filter = property(lambda self: self._input_filter, set_input_filter)

    def _log_state_change(self, from_state, to_state, extra_info):
        doc = {"from": from_state, "to": to_state, "ts": now()}
        if extra_info:
            doc["info"] = extra_info
        self.states_log.append(doc)
        return {"push__states_log": doc}

    def change_state(self, new_state, **kwargs):
        current_state = self.state
        assert \
            new_state in {self.State.ERROR, self.State.CANCELLED} \
            or new_state - current_state <= 2, \
            "Cannot switch to state %s from state %s" % (
                new_state, current_state)
        self.state = new_state
        update_dict = self._log_state_change(current_state, new_state, kwargs)
        update_dict.update(set__state=new_state)
        self.update(**update_dict)

    def to_json(self, fields_to_show=None):
        data = super(DataExport, self).to_json(
            fields_to_show=('id', 'input_filter_hash', 'state', 'created_at'))
        data['input_filter_hash'] = str(data['input_filter_hash'])
        return data

    def process(self, user, params=None):
        state = self.change_state
        S = DataExport.State
        initial_args = user, params
        pipeline = [(S.FETCHING, fetch_posts),
                    (S.GENERATING, PostsCsvGenerator.generate_csv),
                    (None, create_zip_attachments),
                    (S.SENDING, DataExportMailer(self).send_email)]
        try:
            args = initial_args
            for step, command in pipeline:
                step and state(step)
                result = command(*args)
                if not isinstance(result, tuple):
                    args = (result, )
                else:
                    args = result
            state(S.SUCCESS)
        except Exception as exc:
            state(S.ERROR, exception=unicode(exc))
            raise exc
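# Hedged usage sketch (illustrative; the helper name and sample arguments are
# assumptions): an export is created for an account, its filter dict is hashed
# into input_filter_hash for lookup, and process() walks the
# FETCHING -> GENERATING -> SENDING pipeline, logging each transition in
# states_log and raising on failure after recording the ERROR state.
def _example_run_export(account, user, filter_params):
    export = DataExport(account=account, created_by=user, recipients=[user])
    export.input_filter = filter_params      # property setter also fills input_filter_hash
    export.save()
    export.process(user, params=filter_params)
    return export.state == DataExport.State.SUCCESS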