Example #1
    def test_stats_retrieving(self):
        time_slot = datetime_to_timeslot(now(), 'day')
        topics = ('laptop', 'laptop bag', 'good laptop bag', 'good laptop')
        for topic in topics:
            for term, is_leaf in gen_topic_tree(topic):
                ChannelHotTopics.increment(self.channel,
                                           time_slot,
                                           term,
                                           status=0,
                                           intention_id=0,
                                           is_leaf=is_leaf,
                                           lang_id=Lang.EN,
                                           agent=1)

        stats = ChannelHotTopics.objects.by_time_span(
            channel=self.channel,
            from_ts=datetime_to_timeslot(None, 'day'),
            languages=['en'])

        expected_result = [{
            u'term_count': 2,
            u'topic': u'laptop',
            u'topic_count': 1
        }, {
            u'term_count': 2,
            u'topic': u'bag',
            u'topic_count': 0
        }]

        self.assertListEqual(stats, expected_result)
Example #2
    def test_discard_all(self):
        '''Create posts far in the past and make sure we drop them all.'''
        before = ChannelHotTopics.objects().count()
        DAY_10022011 = pytz.utc.localize(datetime(day=2, month=10, year=2011))
        self._make_laptops_and_icecream(DAY_10022011)
        purge_stats(self.channel)
        after = ChannelHotTopics.objects().count()
        self.assertEqual(after, before)
Example #3
    def test_leave_months_only(self):
        before = ChannelHotTopics.objects().count()
        DAY_TWO_WEEKS_AGO = now() - timedelta(days=15)
        self._make_laptops_and_icecream(DAY_TWO_WEEKS_AGO)
        delta = ChannelHotTopics.objects().count() - before
        purge_stats(self.channel)
        after = ChannelHotTopics.objects().count() - before
        # The day-level docs should all be gone. The helper writes one
        # day-level and one month-level doc per stat, so half of delta remains.
        self.assertEqual(after, delta / 2)
Example #4
def get_document_ids(channel_or_tag, time_slot, topics):
    from ..db.speech_act import SpeechActMap

    for status in SpeechActMap.STATUS_NAME_MAP.keys():
        for topic in topics:
            yield ChannelHotTopics.make_id(channel_or_tag, time_slot, topic,
                                           status)
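
A minimal usage sketch for this generator, assuming a channel and timeslot
from your fixtures (Example #6 below exercises the same pattern in a test):

    # Hypothetical usage: materialize the composite ids, then fetch the docs.
    doc_ids = list(get_document_ids(channel, time_slot, ['laptop', 'cream']))
    docs = ChannelHotTopics.objects(id__in=doc_ids)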
Example #5
    def test_number_of_parentless(self):
        content = "I need a mac laptop"

        # One parentless unigram ('laptop'), stored at two timeslot levels
        self._create_db_post(content)

        leaf_stats = ChannelHotTopics.objects(hashed_parents=[])
        self.assertEqual(len(leaf_stats), 1 * 2)  # (laptop) x (day + month)
Example #6
    def test_docs_from_topics(self):
        '''
        Make sure we can get the ids we want and can fetch the docs for them
        '''
        self._make_laptops_and_icecream()
        doc_ids = list(
            get_document_ids(self.channel, self.this_month,
                             ['laptop', 'cream']))

        items = ChannelHotTopics.objects(id__in=doc_ids)
        self.assertEqual(set([item.topic for item in items]),
                         set(['laptop', 'cream']))
Example #7
    def _store_existing_data(self):
        # Keep track of what was in the database when this was called
        self.ctt = {}
        self.ctt_bk = {}
        self.cht = {}
        self.ct = {}
        self.ctt_count = ChannelTopicTrends.objects.count()
        self.cht_count = ChannelHotTopics.objects.count()
        self.ct_count = ChannelTrends.objects.count()
        for ctt in ChannelTopicTrends.objects():
            self.ctt_bk[ctt.data['_id']] = ctt.data
            self.ctt[ctt.data['_id']] = self._process_es(ctt)
        for cht in ChannelHotTopics.objects():
            self.cht[cht.data['_id']] = self._process_es(cht)
        for ct in ChannelTrends.objects():
            self.ct[ct.data['_id']] = self._process_es(ct)
Example #8
    def _compare_existing_data(self):
        # Compare what is currently in the database with what we have stored
        snapshots = [(ChannelTopicTrends, self.ctt),
                     (ChannelHotTopics, self.cht),
                     (ChannelTrends, self.ct)]
        for model, snapshot in snapshots:
            for doc in model.objects():
                for data in doc.data['es']:
                    keys = tuple(sorted(data.keys()))
                    values = tuple(sorted(data.values()))
                    self.assertTrue(
                        (keys, values) in snapshot[doc.data['_id']])
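
Together with Example #7, this helper is presumably meant to bracket the
operation under test, along the lines of this sketch (the middle call is
hypothetical):

    self._store_existing_data()       # snapshot every stats collection
    do_something_with(self.channel)   # hypothetical operation under test
    self._compare_existing_data()     # assert the stored stats survived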
Example #9
def _update_monthly_cht_values(channel, from_date_end, to_date_end, topics):
    """ Do upsert on monthly values based on the daily values.
    """
    from solariat.utils.timeslot import datetime_to_timeslot
    from solariat_bottle.utils.id_encoder import get_topic_hash
    from solariat_nlp.utils.topics import get_subtopics

    from solariat_bottle.db.speech_act import SpeechActMap
    from solariat_bottle.db.channel_hot_topics import ChannelHotTopics
    from solariat_bottle.db.channel_topic_trends import ChannelTopicTrends
    from solariat_bottle.db.channel_stats_base import CountDict, batch_insert

    start_time = datetime.now()
    statuses = SpeechActMap.STATUS_NAME_MAP.keys()
    insertable_values = {}

    if not topics:
        logger.warning("No topics found for channel %s." % (channel.title, ))
        return

    month_intervals = _generate_day_level_ranges(from_date_end, to_date_end)
    for topic in topics:
        for from_date, to_date in month_intervals:
            or_query = []
            # Build the $match query: one id range per status for this topic
            for status in statuses:
                from_id = ChannelTopicTrends.make_id(
                    channel, datetime_to_timeslot(from_date, 'day'), topic,
                    status)
                to_id = ChannelTopicTrends.make_id(
                    channel, datetime_to_timeslot(to_date, 'day'), topic,
                    status)
                or_query.append({"_id": {"$gte": from_id, "$lte": to_id}})

            if len(or_query) == 1:
                match_query = or_query[0]
            else:
                match_query = {"$or": or_query}

            pipeline = [{
                "$match": match_query
            }, {
                "$unwind": '$es'
            }, {
                '$group': {
                    '_id': {
                        'grp_at': '$es.at',
                        'grp_if': '$es.if',
                        'grp_in': '$es.in',
                        'grp_le': '$es.le',
                        'grp_tc': '$tc',
                        'grp_ss': '$ss'
                    },
                    'count': {
                        '$sum': '$es.tt'
                    }
                }
            }]
            month_level_counts = {}
            aggregation_result = ChannelHotTopics.objects.coll.aggregate(
                pipeline)
            if aggregation_result['ok']:
                for aggregated_count in aggregation_result['result']:
                    month_id = ChannelHotTopics.make_id(
                        channel=channel,
                        time_slot=datetime_to_timeslot(from_date, 'month'),
                        topic=aggregated_count['_id']['grp_tc'],
                        status=aggregated_count['_id']['grp_ss'])
                    if month_id in month_level_counts:
                        month_doc = month_level_counts[month_id]
                    else:
                        hashed_parents = map(
                            get_topic_hash,
                            get_subtopics(aggregated_count['_id']['grp_tc']))
                        month_doc = ChannelHotTopics(
                            channel=channel,
                            hashed_parents=hashed_parents,
                            time_slot=datetime_to_timeslot(from_date, 'month'),
                            topic=aggregated_count['_id']['grp_tc'],
                            status=aggregated_count['_id']['grp_ss'])
                        month_doc.version = 0
                        month_doc.embedded_dict = {}
                        month_level_counts[month_id] = month_doc

                    es_key = (aggregated_count['_id']['grp_at'],
                              aggregated_count['_id']['grp_if'],
                              aggregated_count['_id']['grp_in'],
                              aggregated_count['_id']['grp_le'])
                    # Store the aggregated monthly topic_count under this
                    # embedded-stats key.
                    month_doc.embedded_dict[es_key] = CountDict(
                        {'topic_count': aggregated_count['count']})
                for key in month_level_counts:
                    insertable_values[key] = month_level_counts[key]
            else:
                logger.warning("Pipeline failed. Returned %s." %
                               agreggation_result)

    if insertable_values:
        ChannelHotTopics.objects.coll.remove(
            {'_id': {'$in': insertable_values.keys()}})
        batch_insert(insertable_values.values())
    logger.info("Integrating monthly level topics took: " +
                str(datetime.now() - start_time))
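
For reference, each entry consumed from the aggregation result above should
have roughly the following shape. The values are illustrative, and the
meanings of the es.* fields are inferred from the increment API shown in
Example #10 below (agent, is_leaf, intention, language):

    # Illustrative $group output document; all values are made up.
    {
        '_id': {
            'grp_at': 1,          # es.at -- agent
            'grp_if': True,       # es.if -- is_leaf flag
            'grp_in': 0,          # es.in -- intention id
            'grp_le': Lang.EN,    # es.le -- language id
            'grp_tc': 'laptop',   # tc    -- topic
            'grp_ss': 0,          # ss    -- status
        },
        'count': 42,              # sum of es.tt over the matched day docs
    }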
Example #10
    def test_stat_update(self):
        Leaf = Topic = True
        Node = Term = False
        HELP = 10
        JUNK = 12
        EN = Lang.EN
        LALL = Lang.ALL

        time_slot = datetime_to_timeslot(now(), 'day')

        topic = 'laptop'
        agent_id = 12345
        hashed_parents = map(mhash, get_largest_subtopics(topic))

        stat = ChannelHotTopics(channel_num=self.channel.counter,
                                time_slot=time_slot,
                                topic=topic,
                                status=0,
                                hashed_parents=hashed_parents)

        stat.compute_increments(is_leaf=True,
                                intention_id=JUNK,
                                agent=None,
                                lang_id=Lang.EN,
                                n=1)
        stat.compute_increments(is_leaf=False,
                                intention_id=HELP,
                                agent=None,
                                lang_id=Lang.EN,
                                n=1)
        stat.upsert()
        stat = ChannelHotTopics.objects.get(id=stat.id)
        stat.compute_increments(is_leaf=True,
                                intention_id=JUNK,
                                agent=agent_id,
                                n=2)
        stat.upsert()

        stat.reload()

        expected_stats = [
            # agent | is_leaf | intent | language | topic_count
            (ALL_AGENTS, Term, ALL_INTENTIONS_INT, LALL, 1),
            (ALL_AGENTS, Term, ALL_INTENTIONS_INT, EN, 1),
            (ALL_AGENTS, Term, HELP, LALL, 1),
            (ALL_AGENTS, Term, HELP, EN, 1),
            (ALL_AGENTS, Topic, ALL_INTENTIONS_INT, LALL,
             1 + 2),  # +2 from specific agent
            (ALL_AGENTS, Topic, JUNK, LALL, 1 + 2),
            (ALL_AGENTS, Topic, JUNK, EN, 1),
            (ALL_AGENTS, Topic, ALL_INTENTIONS_INT, EN, 1),
            (agent_id, Topic, ALL_INTENTIONS_INT, LALL, 2),
            (agent_id, Topic, JUNK, LALL, 2)
        ]

        self.assert_stats(stat, expected_stats)

        self.assertFalse(stat.filter(agent=0, is_leaf=True,
                                     intention=10))  # no such combination
Example #11
def get_term_stats(term):
    query = {"topic": term, "hashed_parents": []}
    res = ChannelHotTopics.objects(**query)[:]
    assert len(res) == 2, res  # 2 for month and day timeslots
    return res[0]
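
A hedged usage sketch; the term value is illustrative and the attribute
access mirrors Example #6:

    # Hypothetical call: fetch one parentless stat document for a term.
    stat = get_term_stats('laptop')
    assert stat.topic == 'laptop'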