Example #1
    def test_extract_signature(self):
        up = self._make_agent('*****@*****.**', '^IZ', None)
        content = "You don't need an appointment.\n^IZ"
        self.assertEqual(
            Post(content=content,
                 actor_id=up.id,
                 is_inbound=False,
                 _native_id='1').extract_signature(), '^IZ')
        content = "You don't need an appointment. ^IZ"
        self.assertEqual(
            Post(content=content,
                 actor_id=up.id,
                 is_inbound=False,
                 _native_id='2').extract_signature(), '^IZ')
Example #2
    def test_nonmatching_sentiments(self):
        """
        Posts, based on setup:

        (u'I need a bike. I like Honda.', Positive),
        (u'Can somebody recommend a sturdy laptop?', Neutral),
        (u'I need an affordabl laptop. And a laptop bag', Neutral),
        (u'Whatever you buy, let it be an Apple laptop', Neutral),
        (u'I would like to have a thin and lightweight laptop.', Neutral),
        (u'Thank you very much!', Positive),
        (u"You're gonna end up with a broken laptop", Negative)
        """
        from solariat_nlp.sentiment import extract_sentiment
        from solariat_bottle.db.post.base import Post
        print([(p.content, extract_sentiment(p.content)['sentiment'])
               for p in Post.objects()])
        trends = self.get_trends(
            **{
                'channel_id': str(self.channel.id),
                'from': self.one_day_before_str,
                'to': self.one_day_after_str,
                'level': 'hour',
                'topics': [{
                    'topic': 'laptop bag',
                    'topic_type': 'leaf'
                }],
                'sentiments': ['neutral'],
                'plot_type': 'sentiment',
                'group_by': 'status',
            })
        self.assertTrue(trends)  # we have some positive and negative examples
Example #3
    def test_select_by_time_span_3(self):
        past_dt = now() - relativedelta(months=1)  # big enough for all levels

        post1 = self._create_db_post(_created=past_dt,
                                     content='i need some carrot')

        post2 = self._create_db_post(content='i need some carrot')

        self.assertEqual(
            Post.objects(channels__in=[self.channel.id]).count(), 2)

        for level in ('hour', 'day'):
            result = ChannelTopicTrends.objects.by_time_span(
                channel=self.channel,
                topic_pairs=[['carrot', True]],
                from_ts=datetime_to_timeslot(past_dt, level),
                to_ts=datetime_to_timeslot(None, level))
            self.assertEqual(len(result), 2)

            result = ChannelTopicTrends.objects.by_time_span(
                channel=self.channel,
                topic_pairs=[['carrot', True]],
                from_ts=datetime_to_timeslot(
                    past_dt + relativedelta(**{level + 's': 1}), level),
                to_ts=datetime_to_timeslot(None, level))
            self.assertEqual(len(result), 1)
Example #4
def su_queue_view(user, channel_id):
    query = {}
    if channel_id != 'all':
        try:
            from solariat_bottle.utils.post import get_service_channel
            service_channel = get_service_channel(Channel.objects.get(channel_id))
            dispatch_channel = service_channel.get_outbound_channel(user)
        except Exception as exc:
            return jsonify(ok=False, channel_id=channel_id, error=str(exc))
        else:
            channel_ids = []
            if service_channel:
                channel_ids.append(str(service_channel.id))
            if dispatch_channel:
                channel_ids.append(str(dispatch_channel.id))
            query = dict(channel_id__in=channel_ids)

    limit = int(request.args.get('limit', 20))
    offset = int(request.args.get('offset', 0))

    from solariat_bottle.db.queue_message import QueueMessage
    from solariat_bottle.db.post.base import Post
    messages = []
    for message in QueueMessage.objects(**query).limit(limit).skip(offset):
        post_data = post_to_data(Post(message.post_data))
        post_data['message_id'] = str(message.id)
        post_data['reserved_until'] = str(message.reserved_until)
        messages.append(post_data)
    return jsonify(channel_id=channel_id, limit=limit, offset=offset, result=messages,
                   total=QueueMessage.objects(**query).count())
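A note on the paging contract: the view reads `limit` and `offset` from the query string (defaulting to 20 and 0) and returns one page plus a `total` count, so a client can walk the whole queue. A minimal client-side pager sketch, assuming a `fetch_page(limit, offset)` callable that wraps the actual HTTP request (the route itself is not shown on this page):

def iter_queue(fetch_page, page_size=20):
    # fetch_page(limit, offset) -> the JSON dict su_queue_view returns
    offset = 0
    while True:
        page = fetch_page(page_size, offset)
        for item in page['result']:
            yield item
        offset += page_size
        if offset >= page['total']:
            break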
Example #5
def get_posts_by_id(post_id):
    """
    Returns QuerySet of posts list
    :param post_id: antive or tango id
    :return:
    """
    from solariat.db.fields import Binary
    from solariat_bottle.db.post.facebook import FacebookEventMap
    from solariat_bottle.db.post.twitter import TwitterEventMap
    from solariat_bottle.db.post.base import Post
    post_ids = []
    for em in FacebookEventMap.objects(native_id=post_id):
        post_ids.append(em.event_id)
    for em in TwitterEventMap.objects(native_id=post_id):
        post_ids.append(em.event_id)

    try:
        post_ids.append(Binary(post_id.decode('base64')))
    except Exception:
        pass
    try:
        post_ids.append(long(post_id))
    except Exception:
        pass

    return Post.objects(id__in=post_ids)
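A hypothetical call site: the helper tries every interpretation of the id (event maps, base64-packed binary, plain numeric), so the caller does not need to know which form it holds.

# Sketch only; the id value is made up.
posts = get_posts_by_id('445566778899')
if posts.count():
    post = posts[0]
else:
    pass  # not a known native id under any interpretation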
Example #6
    def test_impressions(self):
        "Test impressions stats"

        pl1 = self._create_db_matchable('foo')
        pl2 = self._create_db_matchable('bar')
        pl3 = self._create_db_matchable('baz')

        response = self.do_post('posts',
                                version='v1.2',
                                channel=str(self.channel.id),
                                content='i need a foo for bar but not baz')

        post_dict = response['item']

        #matchables = post_dict['matchables']

        response = self.do_post('postmatches',
                                version='v1.2',
                                post=post_dict['id'],
                                impressions=[str(pl1.id),
                                             str(pl2.id)],
                                rejects=[str(pl3.id)])

        self.assertEqual(response['item']['rejects'][0], str(pl3.id))

        time_slot = timeslot.datetime_to_timeslot(Post.objects()[0].created)

        response = self.do_get('channelstats',
                               version='v1.2',
                               channel=str(self.channel.id),
                               time_slot=time_slot)  # month stats object
        stats = response['list'][0]
        self.assertEqual(stats['number_of_impressions'], 2)
Example #7
    def test_api_case(self):
        """
        Verify with a test case that when a reply post is submittted 
        to the system via API end point that no update is made to the classifier
        """
        self.inbound.adaptive_learning_enabled = True
        self.inbound.save()
        self.inbound.reload()
        original_clf_hash = sha1(self.inbound.channel_filter.clf.packed_model)
        token = self.get_token()

        dummy_id = 'dummy_id'
        data = {
            'content': 'Test post',
            'lang': 'en',
            'channel': str(self.inbound.id),
            'token': token,
            'twitter': {
                'id': dummy_id
            }
        }
        resp = self.client.post('/api/v2.0/posts',
                                data=json.dumps(data),
                                content_type='application/json',
                                base_url='https://localhost')
        post_data = json.loads(resp.data)
        self.assertEqual(resp.status_code, 200)
        self.assertTrue(post_data['ok'])

        post = Post.objects(channels=self.inbound.id)[0]
        reply_data = {
            'content': 'Reply post',
            'lang': 'en',
            'channel': str(self.outbound.id),
            'token': token,
            'user_profile': {
                'screenname': 'random_screenname'
            },
            'twitter': {
                'in_reply_to_status_id': dummy_id,
                'id': 'reply_dummy_id'
            }
        }
        resp = self.client.post('/api/v2.0/posts',
                                data=json.dumps(reply_data),
                                content_type='application/json',
                                base_url='https://localhost')
        post.reload()
        post_data = json.loads(resp.data)
        self.assertEqual(resp.status_code, 200)
        self.assertTrue(post_data['ok'])

        self.inbound.channel_filter.reload()
        latest_clf_hash = sha1(self.inbound.channel_filter.clf.packed_model)
        self.assertNotEqual(original_clf_hash.hexdigest(),
                            latest_clf_hash.hexdigest())
Example #8
    def select_and_reserve(self,
                           channel,
                           limit=DEFAULT_LIMIT,
                           reserve_time=DEFAULT_RESERVE_TIME):
        """
        Query batch of messages from database and reserve it until successful pull callback
        """
        from solariat_bottle.db.post.base import Post

        log_enabled = is_enabled(channel)
        query = {
            'channel_id': str(channel),
            'reserved_until': {
                '$lt': datetime.utcnow()
            }
        }
        messages = self.find(**query).limit(limit)

        result = []
        duplicate_count = 0
        queue_messages = []

        salt_length = 5
        batch_token = None
        deadline = datetime.utcnow() + timedelta(seconds=reserve_time)
        expired_tokens = set([])
        for message in messages:
            if batch_token is None:
                batch_token = '%s%s%s' % (datetime.utcnow().__hash__(),
                                          message.id,
                                          Random().getrandbits(salt_length))
            if message.batch_token:
                duplicate_count += 1
                # If we re-added these posts, then the token has expired
                expired_tokens.add(message.batch_token)
            message.reserved_until = deadline
            message.batch_token = batch_token
            message.save()
            if log_enabled:
                queue_messages.append(
                    Post(message.post_data).plaintext_content)
            result.append(message)

        if expired_tokens:
            self.coll.update(
                {'batch_token': {'$in': list(expired_tokens)}},
                {'$set': {'batch_token': None}},
                multi=True)
        if log_enabled:
            get_logger(channel).info(
                u"QMD: Pulling / Reserving from queue messages: %s",
                unicode(queue_messages))
        return result, duplicate_count
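Taken together with `remove_reserved` (Example #16 below), this supports an at-least-once pull cycle: reserve a batch, process it, and acknowledge by batch token; if the consumer dies, `reserved_until` lapses and the batch is served again. A sketch of that cycle, assuming both methods live on the QueueMessage manager (as the `self.find` calls suggest); `channel` and `process` are ours:

messages, dup_count = QueueMessage.objects.select_and_reserve(
    channel, limit=100, reserve_time=60)
if messages:
    try:
        process(messages)  # hypothetical handler
    except Exception:
        # leave the batch alone: it is served again once reserved_until lapses
        pass
    else:
        # acknowledge: every message in the batch shares one batch_token
        QueueMessage.objects.remove_reserved(messages[0].batch_token)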
Example #9
    def test_channel_stats(self):
        content = 'I need a bike. I like Honda .'
        # self._create_db_matchable(url='google.com',
        #                           creative='search for bike here')
        post = self._create_db_post(content)
        self.assertTrue(post.id is not None)
        self.assertEqual(
            Post.objects(channels__in=[self.channel.id]).count(), 1)
        for stats in get_levels(self.channel):
            stats.reload()
            self.assertEqual(stats.number_of_posts, 1)
            self.assertEqual(stats.feature_counts['2'], 1)
            self.assertEqual(stats.feature_counts['4'], 1)
Example #10
    def test_crud(self):
        post = self._create_db_post(content='I need a new moto bike')
        self.assertTrue(
            post.user_tag.startswith('unknown')
            or post.user_tag.startswith('anonymous'))

        count = Post.objects.find(channels=str(self.channel.id)).count()
        self.assertEqual(count, 1)
        self.assertEqual(
            Post.objects.get(post.id)['content'], 'I need a new moto bike')
        post.delete()
        self.assertEqual(
            Post.objects(channels__in=[self.channel.id]).count(), 0)
Example #11
    def _create_posts(self):
        past_created = now() - timedelta(minutes=7 * 24 * 60)
        post1 = self._create_db_post(_created=past_created,
                                     content='i need some carrot')

        past_created = now() - timedelta(minutes=7 * 24 * 60 + 10)
        post2 = self._create_db_post(_created=past_created,
                                     content='Where I can buy a carrot?')

        self._create_db_post(content='i need some carrot')
        self._create_db_post(content='Where I can buy a carrot?')

        self.assertEqual(
            Post.objects(channels__in=[self.channel.id]).count(), 4)
Example #12
    def test_post_stats(self):
        self.assertEqual(ChannelStats.objects().count(), 0)
        response = self.do_post('posts',
                                version='v1.2',
                                channel=str(self.channel.id),
                                content='i need a foo for bar but not baz')

        # Should have allocated stats for each level
        self.assertEqual(ChannelStats.objects().count(), 3)

        post = Post.objects()[0]
        for stats in get_levels(self.channel, post.created):
            stats.reload()
            #print stats.to_dict()
            self.assertEqual(stats.number_of_posts, 1)
            self.assertEqual(stats.feature_counts, {'0': 1, '2': 1})
Example #13
def su_print_conversation(user, conv_id):
    try:
        conv = Conversation.objects.get(long(conv_id))
    except (Conversation.DoesNotExist, ValueError):
        return jsonify(ok=False, error='Does not exist')

    from solariat_bottle.db.post.base import Post

    data = {
        'conversation_data': bson_safe(conv.data),
        'posts': []
    }

    for post in Post.objects(id__in=conv.posts):
        data['posts'].append(post_to_data(post))
    return jsonify(data)
Example #14
    def clear_reserved_id_based(self, post_ids):
        reserver = self.find(id__in=post_ids)[:]
        channel_ids = []
        for message in reserver:
            channel_ids.extend(list(message.channel_id))
        if is_enabled(channel_ids):
            from solariat_bottle.db.post.base import Post

            queue_messages = []
            for message in reserver:
                queue_messages.append(
                    Post(message.post_data).plaintext_content)
            get_logger(channel_ids).info(
                u"QMD: Confirming / Clearing from queue messages: %s",
                unicode(queue_messages))
        return self.remove(id__in=post_ids)
Example #15
    def _do_test(self, ds_bot, posts_before=0):
        self.assertEqual(Post.objects.count(), posts_before)

        if not ds_bot.is_running():
            self.start_bot(ds_bot)

        post_data = patch_created_at(SAMPLE_VALID_DATA, now())
        ds_bot.post_received(json.dumps(post_data))
        self.wait_bot(ds_bot)

        self.assertEqual(Post.objects.count(), posts_before + 1)
        created_post = Post.objects().sort(_created=-1).limit(1)[0]
        u_p = UserProfile.objects.get(user_name='user1_solariat')
        # Check fields that are required on user profile
        self.assertDictEqual(
            u_p.platform_data, {
                u'lang': u'en',
                u'statuses_count': 1905,
                u'screen_name': u'user1_solariat',
                u'friends_count': 13,
                u'name': u'user1_solariat',
                u'created_at': u'Tue, 07 May 2013 19:35:50 +0000',
                u'profile_image_url':
                u'http://pbs.twimg.com/profile_images/468781442852339712/69CJihsO_normal.jpeg',
                u'id': 1411050992,
                u'followers_count': 8,
                u'id_str': u'1411050992',
                u'location': u'San Francisco',
                u'profile_image_url_https':
                u'https://pbs.twimg.com/profile_images/468781442852339712/69CJihsO_normal.jpeg',
                u'description': u'Teacher'
            })
        self.assertEqual(created_post.content,
                         post_data['data']['twitter']['text'])
        self.assertTrue(
            str(self.channel.inbound_channel.id) in
            created_post.channel_assignments)
        self.assertEqual(
            created_post.channel_assignments[str(
                self.channel.inbound_channel.id)], 'highlighted')
        # Check that we actually hold everything in wrapped data
        print(created_post.wrapped_data)
        print(post_data['data'])
        self.assertDictEqual(created_post.wrapped_data, post_data['data'])
Example #16
    def remove_reserved(self, batch_token):
        '''
        Remove all records from database with provided batch_token
        '''
        reserver = self.find(**{'batch_token': batch_token})[:]
        channel_ids = []
        for message in reserver:
            channel_ids.extend(list(message.channel_id))
        if is_enabled(channel_ids):
            from solariat_bottle.db.post.base import Post

            queue_messages = []
            for message in reserver:
                queue_messages.append(
                    Post(message.post_data).plaintext_content)
            get_logger(channel_ids).info(
                u"QMD: Confirming / Clearing from queue messages: %s",
                unicode(queue_messages))
        return self.remove(**{'batch_token': batch_token})
Example #17
    def setUp(self):
        MainCase.setUp(self)
        past_id = ObjectId.from_datetime(
            datetime.now() - timedelta(minutes=7*24*60))
        post1 = self._create_db_post(
            id=past_id, channel=self.channel,
            content='i need some foo')
        #Post.objects.insert(post1.data)

        past_id = ObjectId.from_datetime(
            datetime.now() - timedelta(minutes=7*24*60+10))
        post2 = self._create_db_post(
            id=past_id, channel=self.channel,
            content='where i can find a foo?')
        #Post.objects.insert(post2.data)

        post3 = self._create_db_post(
            channel=self.channel, content='i need some foo')
        post4 = self._create_db_post(
            channel=self.channel, content='where i can find a foo?')
        post5 = self._create_db_post(channel=self.channel, content='LOL')
        self.assertEqual(Post.objects(channels__in=[self.channel.id]).count(), 5)
Example #18
    def setUp(self):
        UICase.setUp(self)
        self.login()
        first_date = utc(datetime(2012, 1, 1))
        post1 = self._create_db_post(_created=first_date,
                                     content='i need some carrot')
        self.assertEqual(
            Post.objects(channels__in=[self.channel.id]).count(), 1)

        # 1 jan + 10 minutes
        second_date = first_date + timedelta(minutes=10)
        post2 = self._create_db_post(_created=second_date,
                                     content='where i can buy a carrot?')
        self.assertEqual(
            Post.objects(channels__in=[self.channel.id]).count(), 2)

        # 1 jan + 7 days
        third_date = first_date + timedelta(minutes=7 * 60 * 24)
        post3 = self._create_db_post(_created=third_date,
                                     content='i need some carrot')
        self.assertEqual(
            Post.objects(channels__in=[self.channel.id]).count(), 3)

        forth_date = third_date + timedelta(minutes=10)
        post4 = self._create_db_post(_created=forth_date,
                                     content='where i can buy a carrot?')
        self.assertEqual(
            Post.objects(channels__in=[self.channel.id]).count(), 4)

        # This will not be created, only for stats
        post5 = Post(channels=[self.channel.id],
                     content='LOL',
                     actor_id=post4.user_profile.id,
                     is_inbound=True,
                     _native_id='1',
                     _created=post4._created)
        self.assertEqual(
            Post.objects(channels__in=[self.channel.id]).count(), 4)
        no_post_created(post5, utc(forth_date + timedelta(minutes=10)))
        self.now = now()
Example #19
def fetch_posts(channels,
                start_ts,
                end_ts,
                topics,
                statuses,
                intentions,
                min_conf,
                agents,
                sort_by='time',
                limit=100,
                message_type=None,
                create_date_limit=None,
                languages=None):

    from solariat_bottle.db.post.utils import get_platform_class
    from solariat_bottle.db.channel.base import Channel
    from solariat.db.fields import BytesField

    # --- Preliminary range query for the core matching elements ---
    topics = [
        t if isinstance(t, dict) else dict(topic=t, topic_type='leaf')
        for t in topics
    ]

    to_binary = BytesField().to_mongo
    match_query_base = []

    for channel in channels:
        for status in statuses:
            # compute id bounds for all posts for this slot
            id_lower_bound = pack_speech_act_map_id(channel, status, start_ts, 0)
            id_upper_bound = pack_speech_act_map_id(channel, status, end_ts,
                                                    BIGGEST_POST_VALUE)

            # add an id-constraining query
            assert start_ts <= end_ts
            assert id_upper_bound >= id_lower_bound

            match_query_base.append({
                '_id': {
                    "$gte": to_binary(id_lower_bound),
                    "$lte": to_binary(id_upper_bound)
                }
            })

    primary_filter = {"$or": match_query_base}

    # Add intention restrictions, which operate on the main fields
    primary_filter["ic"] = {"$gte": min_conf}
    if intentions:
        primary_filter["ii"] = {"$in": intentions}

    if message_type is not None:
        primary_filter["mtp"] = {"$in": message_type}

    # Constrain for agents, again, at the primary level
    if agents:
        primary_filter["at"] = {"$in": agents}

    if languages:
        from solariat_bottle.db.channel_trends import make_lang_query

        primary_filter = {
            "$and": [
                primary_filter,
                make_lang_query(languages, SpeechActMap.language.db_field)
            ]
        }

    pipeline = [{"$match": primary_filter}]

    # Generate Secondary Filter only if we have topic constraints.
    topics_match_query = []
    for topic in topics:
        if topic['topic'] != ALL_TOPICS:
            topics_match_query.append({
                'tt.l': topic['topic_type'] == 'leaf',
                'tt.t': topic['topic']
            })

    if topics_match_query:
        pipeline.append({"$unwind": "$tt"})
        if len(topics_match_query) == 1:
            pipeline.append({"$match": topics_match_query[0]})
        else:
            pipeline.append({"$match": {"$or": topics_match_query}})

    # Impose a hard cap first: we cannot spend all day fetching data, and in the
    # worst case the result set could be huge. 10000 is a reasonable ceiling for
    # searching posts; the input `limit` overrides it when larger.
    pipeline.append({"$limit": max(10000, limit)})

    # We want the data in sorted order in general.
    pipeline.append({"$sort": {"ca": -1}})

    # Now throttle the results to a workable page, where specified

    platform = None
    for ch in channels:
        if not isinstance(ch, Channel):
            ch = Channel.objects.get(ch)
        channel_platform = ch.platform
        if platform and platform != channel_platform:
            # TODO: Is this the correct approach or should we just
            # return a bunch of base posts objects in this case ?
            raise AppException(
                "Trying to fetch posts over multiple platforms!")
        else:
            platform = channel_platform

    # Use the correct class depending on the platform we are searching for
    Post = get_platform_class(platform)

    are_more_speech_acts_fetched = True
    len_res_result = 0
    # we start with this limit because there are
    # ~2 speech acts per post on average
    sa_limit = 2 * limit
    posts = set([])

    # posts are created from speech acts (SA);
    # there may be several SAs for one post.
    # we keep increasing `sa_limit` for the SA query until `limit` posts
    # are fetched or until no more SAs are returned
    while len(posts) < limit and are_more_speech_acts_fetched:

        pipeline.append({"$limit": sa_limit})
        res = SpeechActMap.objects.coll.aggregate(pipeline)
        new_posts = Post.objects(
            id__in=list(set([r['pt'] for r in res['result']])))
        if create_date_limit:
            new_posts = [
                p for p in new_posts if p.created_at < create_date_limit
            ]
        posts.update(set(new_posts))
        if len_res_result < len(res['result']):
            len_res_result = len(res['result'])
            sa_limit = 2 * sa_limit
        else:
            are_more_speech_acts_fetched = False

        # drop the limit stage; a new one is appended at the top of the loop
        del pipeline[-1]

    posts = list(posts)
    posts.sort(key=lambda p: p.created_at, reverse=True)

    # start_time = datetime.now()
    #LOGGER.debug("PostManager.by_time_point Aggregated and retrieved in %s sec. Result=%d",
    #                 datetime.now()-start_time,
    #                 len(posts))
    #LOGGER.debug("PostManager.by_time_point Pipeline=\n%s", pprint.pformat(pipeline))

    return posts
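The while loop above deserves a distilled restatement: the aggregation returns speech acts, several of which may map to one post, so the speech-act limit is grown geometrically until enough distinct posts accumulate or the source is exhausted. A standalone sketch of that pattern (all names are ours):

def expand_until(limit, run_query):
    """run_query(n) returns up to n speech-act rows, each with a post id in 'pt'."""
    sa_limit = 2 * limit        # ~2 speech acts per post on average
    seen_rows = 0
    post_ids = set()
    while len(post_ids) < limit:
        rows = run_query(sa_limit)
        post_ids.update(r['pt'] for r in rows)
        if len(rows) <= seen_rows:   # no growth means the source is exhausted
            break
        seen_rows = len(rows)
        sa_limit *= 2                # double and retry
    return post_ids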
Example #20
def purge_channel_outdated_posts_and_sas(channel,
                                         now_date=None,
                                         run_in_prod_mod=False):
    """ 
    purges outdated posts and sas 
    basing on CHANNEL_ENTITIES_KEEP_DAYS setting
    """
    today_dt = now_date if now_date else utc(now())
    delta = relativedelta(days=get_var("CHANNEL_ENTITIES_KEEP_DAYS"))
    to_dt = today_dt - delta

    # counting chunks
    CHUNK_SIZE = 100
    post_number = Post.objects(channels=str(channel.id),
                               _created__lt=to_dt).count()
    res = {'post_total': 0, 'sas_total': 0}
    if not post_number:
        LOGGER.info("purge_outdated_posts:: %s: no posts to purge" %
                    channel.title)
        return res
    chunks_number = post_number / CHUNK_SIZE + 1
    start_dt = datetime.now()

    # handling posts and sas chunk by chunk
    for i in range(chunks_number):
        offset = i * CHUNK_SIZE
        t0 = datetime.now()
        # getting posts for removal
        if run_in_prod_mod:
            post_query = Post.objects(channels=str(channel.id),
                                      _created__lt=to_dt).limit(CHUNK_SIZE)
        else:
            post_query = Post.objects(
                channels=str(channel.id),
                _created__lt=to_dt).limit(CHUNK_SIZE).skip(offset)
        posts = [p for p in post_query]
        post_query = None
        LOGGER.info(
            'purge_outdated_posts:: %s: chunk #%s of %s chunks (%s posts_number; post query timedelta: %s)',
            channel.title, i, chunks_number, post_number,
            datetime.now() - t0)
        post_ids = [
            long(p.id)
            if isinstance(p.id, (str, unicode)) and p.id.isdigit() else p.id
            for p in posts
        ]

        if run_in_prod_mod:
            # perform actual removal
            t0 = datetime.now()
            sas_res = SpeechActMap.objects.coll.remove(
                SpeechActMap.objects.get_query(post__in=post_ids))
            post_res = Post.objects.coll.remove(
                Post.objects.get_query(id__in=post_ids))
            LOGGER.info(
                'purge_outdated_posts:: %s: post removed: %s; sas removed: %s;'
                ' chunk #%s of %s chunks; sas and post'
                ' remove queries timedelta: %s', channel.title, post_res['n'],
                sas_res['n'], i, chunks_number,
                datetime.now() - t0)
            res['post_total'] += post_res['n']
            res['sas_total'] += sas_res['n']
        else:
            t0 = datetime.now()
            # getting sas for removal
            sas = [s for s in SpeechActMap.objects(post__in=post_ids)]
            LOGGER.info(
                'purge_outdated_posts:: %s: chunk #%s of %s chunks; sas count: %s; sas query timedelta: %s',
                channel.title, i, chunks_number, len(sas),
                datetime.now() - t0)
            res['post_total'] += len(posts)
            res['sas_total'] += len(sas)
    LOGGER.info('purge_outdated_posts:: %s: total timedelta: %s; stats: %s',
                channel.title,
                datetime.now() - start_dt, res)
    return res
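Assumed usage, inferred from the `run_in_prod_mod` flag: the default call is a dry run that only counts what would be removed, and the destructive pass is opt-in.

# Dry run: walks the chunks and tallies posts/SAs without removing anything.
stats = purge_channel_outdated_posts_and_sas(channel)
# Destructive pass: actually issues the remove() calls.
stats = purge_channel_outdated_posts_and_sas(channel, run_in_prod_mod=True)
print stats  # e.g. {'post_total': 1200, 'sas_total': 2400} (numbers made up)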
Example #21
    def test_queue_integration(self):
        """ Covers full integration from starting Subscriber,
            through TwitterTimelineRequest's fetchers, HistoricLoader
            until PostCreator.create_post().
        """
        from solariat_bottle.settings import LOGGER
        from solariat_bottle.db.channel.twitter import TwitterServiceChannel
        from solariat_bottle.db.historic_data import QueuedHistoricData
        from solariat_bottle.db.post.base import Post
        from solariat_bottle.daemons.twitter.historics.timeline_request import \
            DirectMessagesRequest, SentDirectMessagesRequest, SearchRequest, UserTimelineRequest
        from solariat_bottle.db.user_profiles.user_profile import UserProfile

        # reduce amount of data for long-running integration test
        FakeTwitterApi.SEARCH_DATA_LENGTH = 50
        FakeTwitterApi.TIMELINE_DATA_LENGTH = 50
        FakeTwitterApi.DM_DATA_LENGTH = 50
        FakeTwitterApi.DM_SENT_DATA_LENGTH = 50
        FakeTwitterApi.ALL_DATA_LENGTH = 200
        FakeTwitterApi.CREATED_FROM = FakeTwitterApi.CREATED_TO - timedelta(days=1)
        FakeTwitterApi.init_next_params()
        SearchRequest.SEARCH_LIMIT = 10
        UserTimelineRequest.FETCH_LIMIT = 20
        DirectMessagesRequest.DIRECT_MESSAGES_LIMIT = 20
        SentDirectMessagesRequest.DIRECT_MESSAGES_LIMIT = 20

        profile = UserProfile.objects.upsert('Twitter', profile_data=dict(user_name='jarvis', user_id='99188210'))
        channel = TwitterServiceChannel.objects.create_by_user(self.user, title='SC')
        channel.add_username(profile.user_name)
        channel.add_keyword(u'keywörd')

        def get_id_date_pair(post_data):
            if 'twitter' in post_data:
                post_data = post_data['twitter']
            return int(post_data['id']), post_data['created_at']

        fetched_data = []
        def _save_tweets(fn):
            def decorated(tweets, *args, **kwargs):
                LOGGER.debug('PUSH_POSTS, len:%s', len(tweets))
                fetched_data.extend([get_id_date_pair(t) for t in tweets])
                return fn(tweets, *args, **kwargs)
            return decorated

        queued_data = []
        def _save_queued_data(method):
            def _method(*args, **kwargs):
                queued_data[:] = [
                    get_id_date_pair(i.solariat_post_data) for i in
                    QueuedHistoricData.objects(subscription=subscription)
                ]
                LOGGER.debug('QUEUED_POSTS, len: %s', len(queued_data))
                self.assertTrue(len(queued_data) == FakeTwitterApi.ALL_DATA_LENGTH,
                                msg="len=%d %s" % (len(queued_data), queued_data))
                self.assertEqual(set(queued_data), set(fetched_data),
                                 msg=u"\nqueued =%s\nfetched=%s" % (queued_data, fetched_data))
                return method(*args, **kwargs)
            return _method

        subscription = TwitterRestHistoricalSubscription.objects.create(
            created_by=self.user,
            channel_id=channel.id,
            from_date=FakeTwitterApi.CREATED_FROM,
            to_date=FakeTwitterApi.CREATED_TO
        )
        subscriber = TwitterHistoricsSubscriber(subscription)
        subscriber.push_posts = _save_tweets(subscriber.push_posts)
        subscriber.historic_loader.load = _save_queued_data(subscriber.historic_loader.load)

        subscriber.start_historic_load()
        self.assertEqual(subscriber.get_status(), SUBSCRIPTION_FINISHED)

        self.assertEqual(Post.objects(channels__in=[
            subscription.channel.inbound,
            subscription.channel.outbound]).count(), FakeTwitterApi.ALL_DATA_LENGTH)

        SearchRequest.SEARCH_LIMIT = 100
        UserTimelineRequest.FETCH_LIMIT = 200
        DirectMessagesRequest.DIRECT_MESSAGES_LIMIT = 200
        SentDirectMessagesRequest.DIRECT_MESSAGES_LIMIT = 200
Example #22
    def get_post_by_content(self, content):
        # since content is encrypted, scan all posts
        post = [p for p in Post.objects() if p.content == content][0]
        return post
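Because content is stored encrypted, there is no server-side filter to lean on and the scan is O(number of posts); the `[0]` also raises IndexError when nothing matches. A slightly more defensive variant (ours, not the source's):

    def get_post_by_content_safe(self, content):
        # same full scan, but stop at the first hit and tolerate no match
        for p in Post.objects():
            if p.content == content:
                return p
        return None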
Example #23
    def test_multi_post(self):
        contents = [
            'Any recommendations for a basketball scholarship? I need a basketball scholarship.',
            'Any recommendations for a basketball scholarship? I need a basketball scholarship.',
            'I love my display!',
            'My display is just not working out for me :-(',
            'Any recommendations for a display?', 'I like my display'
        ]

        for content in contents:
            post = self._create_db_post(content, channel=self.channel)

        from solariat_bottle.db.speech_act import SpeechActMap

        stats_by_topic_intention = {}

        # Calculate stats by iterating through the SpeechActMap (SAM)
        from solariat_bottle.db.post.base import Post
        for post in Post.objects(channels__in=[self.channel.id]):
            for sa in post.speech_acts:
                topics = sa['intention_topics']
                int_id = sa['intention_type_id']
                topics.append('__ALL__')
                for topic in topics:
                    if topic in stats_by_topic_intention:
                        if str(int_id) in stats_by_topic_intention[topic]:
                            stats_by_topic_intention[topic][str(int_id)] += 1
                        else:
                            stats_by_topic_intention[topic][str(int_id)] = 1
                    else:
                        stats_by_topic_intention[topic] = {str(int_id): 1}

        expected_stats_from_sam = {
            u'basketball scholarship': {
                '1': 2,
                '2': 2
            },
            u'display': {
                '1': 1,
                '3': 1,
                '4': 2
            },
            '__ALL__': {
                '1': 3,
                '3': 1,
                '2': 2,
                '4': 2
            }
        }

        self.assertDictEqual(stats_by_topic_intention, expected_stats_from_sam)

        time_slot = datetime_to_timeslot(
            Post.objects(
                channels__in=[self.channel.id]).limit(1)[0].created_at, 'hour')
        status = SpeechActMap.ACTIONABLE

        # Now verify SAM stats correspond to ChannelTopicTrends stats
        for topic, sa_stats in stats_by_topic_intention.iteritems():
            if topic == '__ALL__':
                continue

            stat = ChannelTopicTrends(channel=self.channel,
                                      time_slot=time_slot,
                                      topic=topic,
                                      status=status)
            stat.reload()
            ctt_by_int = {}
            filtered = stat.filter(is_leaf=True, intention__ne=0)

            for s in filtered:
                ctt_by_int[str(s.intention)] = s.topic_count
            self.assertDictEqual(ctt_by_int, sa_stats)
Example #24
def more_like_post(post, channel):
    """
    Returns a queryset of similar posts in a given channels.
    Similarity determined by list of topics and intentions of the initial post.
    Note that we are looking for posts that are similar, but with opposite
    status, since we want to re-lable
    """
    from solariat_bottle.db.post.base    import Post
    from solariat_bottle.db.speech_act   import SpeechActMap
    from solariat_bottle.db.channel.base import Channel
    from solariat_bottle.db.conversation import Conversation

    from solariat.utils.timeslot import Timeslot, DURATION_DAY

    topics        = []
    intention_ids = []
    channel = Channel.objects.ensure_channels([channel])[0]
    assignment = post.get_assignment(channel)
    if channel.is_smart_tag:
        # for smart tags lookup similar posts in parent channel
        parent_channel = Channel.objects.get(channel.parent_channel)
        status = [SpeechActMap.POTENTIAL, SpeechActMap.ACTIONABLE, SpeechActMap.ACTUAL, SpeechActMap.REJECTED]
    else:
        parent_channel = channel
        status = [SpeechActMap.POTENTIAL]
        if assignment in SpeechActMap.ASSIGNED:
            ''' Positive assignment could cause a more precise classification
            of a Potential post and could revert the assignment for Rejected
            posts
            '''
            status.append(SpeechActMap.REJECTED)
        elif assignment in {'rejected', 'discarded'}:
            ''' Conversely, may reject potential posts and may cause a reversion
            of prior allocation for Actionable
            '''
            status.append(SpeechActMap.ACTIONABLE)
        else:
            raise AppException("An internal state is not expected: %s. Please contact support for assistance." % assignment)

    for sa in post.speech_acts:
        topics.extend(sa['intention_topics'])
        intention_ids.append(sa['intention_type_id'])

    # The basic post lookup that just searches for the latest objects
    res, more_posts_available = Post.objects.by_time_point(
                                    parent_channel,
                                    ['__ALL__'],
                                    from_ts   = Timeslot(post.created_at-DURATION_DAY),
                                    to_ts     = Timeslot(post.created_at+timedelta(hours=1)),
                                    status    = status,
                                    intention = intention_ids,
                                    languages = [post.language],
                                    limit     = 10)
    res = set(res)

    if (channel.is_smart_tag):
        # Part of new re-labeling. If tag for a post is rejected, we should
        # go through all posts from the post conversation and through first
        # RESPONSE_DEPTH_FACTOR responses containing the tag
        service_channel = get_service_channel(channel)
        if service_channel:
            conversations = Conversation.objects.lookup_conversations(service_channel, [post])

            if len(conversations) == 1:
                # First extend with all other posts from this conversation that have that tag
                # assigned to them
                res |= set([p for p in Post.objects(id__in=list(conversations[0].posts))
                              if (str(p.id) != str(post.id) and str(channel.id) in p.tag_assignments)])
        # Now go through the first RESPONSE_DEPTH_FACTOR responses which have that tag assigned

    elif (not channel.is_smart_tag and
            SpeechActMap.STATUS_MAP[post.get_assignment(channel)] in [SpeechActMap.ACTIONABLE, SpeechActMap.REJECTED]):
        # In case we reject a post, go through all the posts for the first RESPONSE_DEPTH_FACTOR responses from
        # the same service channel
        channels = [channel]
        if channel.parent_channel is not None:
            service_channel   = Channel.objects.get(id=channel.parent_channel)
            channels.append(service_channel)
        channel_filter = [ c.id for c in channels ]
        channel_filter_refs = [DBRef('Channel', ch) for ch in channel_filter]
        if SpeechActMap.STATUS_MAP[post.get_assignment(channel)] == SpeechActMap.REJECTED:
            target_status = [SpeechActMap.POTENTIAL, SpeechActMap.ACTIONABLE]
        else:
            target_status = [SpeechActMap.POTENTIAL, SpeechActMap.REJECTED]
    return list(res)
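A hypothetical call site, matching the docstring: after an agent accepts or rejects `post` under a smart tag, fetch similar posts with the opposite status as candidates for the same re-labeling.

# Sketch only; `post` and `smart_tag` come from the surrounding handler.
candidates = more_like_post(post, smart_tag)
for similar in candidates:
    pass  # hand each candidate to whatever re-labeling flow the caller runs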