Пример #1
0
    def label_by_content(self, reset_all=False):
        '''
        Label original tweets with this category based on their text content.

        For every original tweet (``retweeted_status`` is None) the tweet text
        is concatenated with the text of all of its retweets, lower-cased and
        segmented with jieba.  If any resulting word is in ``self.keyword_set``
        the tweet's ``predict_category`` is set to ``self.category_id``;
        otherwise it is reset to ``Category.NO_CATEGORY``.  The tweet is saved
        in both cases.

        :param reset_all: when true, every original tweet is re-labeled;
            otherwise only original tweets whose ``predict_category`` is 0
            are processed.
        '''

        import jieba
        jieba.load_userdict("/etc/jieba/jieba.dic")

        logging.info('Start labeling tweets by content')
        page_size = 1000
        cnt = 0

        # NOTE: the loop variable below must not be called ``tweet``; binding
        # that name anywhere in this function would make it local and turn
        # this model lookup into an UnboundLocalError.
        candidates = tweet.objects.filter(retweeted_status__exact=None)
        if not reset_all:
            # Only fetch original tweets that have not been predicted before
            candidates = candidates.filter(predict_category=0)
        # Page by primary key instead of by offset: the loop body changes
        # predict_category, so rows leave the filtered result set while we
        # iterate and an offset would silently skip unprocessed tweets.
        candidates = candidates.order_by('pk')

        last_pk = None
        while True:
            page = candidates
            if last_pk is not None:
                page = page.filter(pk__gt=last_pk)
            batch = list(page[:page_size])

            logging.info('Fetched %d tweets for labeling', len(batch))

            for status in batch:
                retweet_statuses = list(status.retweet_status.all())

                status.predict_category = Category.NO_CATEGORY  # reset category

                # NOTE(review): predict_category is assigned on the retweets
                # here and below, but they are never saved in this method --
                # confirm whether they are meant to be persisted as well.
                for rwb in retweet_statuses:
                    rwb.predict_category = Category.NO_CATEGORY
                content = status.text + ''.join(rwb.text for rwb in retweet_statuses)

                logging.info(u'predicting %s(with %d retweets) -- %s',
                             status, len(retweet_statuses), status.text)

                # A single keyword hit is enough to classify the tweet.
                if any(word in self.keyword_set
                       for word in jieba.cut(content.lower())):
                    logging.info(u'%s Predict %s belongs to %s',
                                 cnt, status, self.category)
                    status.predict_category = self.category_id
                    for rwb in retweet_statuses:
                        rwb.predict_category = self.category_id
                    cnt += 1
                status.save()

            # A short (or empty) page means there is nothing left to fetch.
            if len(batch) < page_size:
                break
            last_pk = batch[-1].pk

        logging.info('Labeled %d new tweet in %s', cnt, self.category)
Пример #2
0
    def label_by_user(self, threshold=3, reset_all=False):
        '''
        Label original tweets with this category based on the users related
        to them.

        For every original tweet (``retweeted_status`` is None) the owners of
        its retweets and comments are inspected.  Each retweet/comment whose
        owner has ``predict_category == self.category_id`` counts as positive.
        If the positive count is strictly greater than ``threshold`` the tweet
        is labeled with ``self.category_id``; otherwise it is reset to
        ``Category.NO_CATEGORY``.  The tweet is saved in both cases.

        :param threshold: number of positive retweets/comments that must be
            exceeded for the tweet to be labeled with this category.
        :param reset_all: when true, every original tweet is re-labeled;
            otherwise only original tweets whose ``predict_category`` is 0
            are processed.
        '''

        logging.info('Start labeling tweets by users related to it')
        page_size = 1000
        cnt0 = 0  # tweets predicted negative
        cnt1 = 0  # tweets predicted positive

        # NOTE: the loop variable below must not be called ``tweet``; binding
        # that name anywhere in this function would make it local and turn
        # this model lookup into an UnboundLocalError.
        candidates = tweet.objects.filter(retweeted_status__exact=None)
        if not reset_all:
            # Only fetch original tweets that have not been predicted before
            candidates = candidates.filter(predict_category=0)
        # Page by primary key instead of by offset: the loop body changes
        # predict_category, so rows leave the filtered result set while we
        # iterate and an offset would silently skip unprocessed tweets.
        candidates = candidates.order_by('pk')

        last_pk = None
        while True:
            page = candidates
            if last_pk is not None:
                page = page.filter(pk__gt=last_pk)
            batch = list(page[:page_size])

            for status in batch:
                logging.info("analyzing tweet:%s", status)

                retweets = list(status.retweet_status.all())
                comments = list(status.comments.all())
                logging.info("%d retweets, %d comments",
                             len(retweets), len(comments))

                # Count interactions whose owner is already in this category.
                pos_cnt = sum(
                    1 for rt in chain(retweets, comments)
                    if rt.owner.predict_category == self.category_id
                )
                neg_cnt = len(retweets) + len(comments) - pos_cnt

                logging.info("%d positive users, %d negtive users",
                             pos_cnt, neg_cnt)

                if pos_cnt > threshold:
                    cnt1 += 1
                    logging.info("predict it as postive:%s", status.text)
                    logging.info("pos/neg : %d/%d", cnt1, cnt0)
                    status.predict_category = self.category_id
                else:
                    cnt0 += 1
                    logging.debug("predict it as negtive")
                    status.predict_category = Category.NO_CATEGORY  # reset category

                status.save()

            # A short (or empty) page means there is nothing left to fetch.
            if len(batch) < page_size:
                break
            last_pk = batch[-1].pk

        logging.info('Labeled %d new tweet in %s', cnt0 + cnt1, self.category)