    def get_feature_vector(self, tweet):
        """
        Build the feature vector for a tweet. The vector contains:
        1) 1.0 or 0.0 per query-set keyword, depending on whether it
           appears in the tweet text or hashtags
        2) 1.0 or 0.0 per query-set user, depending on whether they are
           mentioned in (or authored) the tweet
        3) missed = # tweet words not in the query set / # words in tweet
        @param tweet: a tweet in the Twitter JSON format
        @return list: feature vector
        """
        text = tweet['text']
        hashtags, user_mentions, urls = extract_entity(tweet)
        # count the author as a mention so query-set users can match authors
        user_mentions.append(tweet['user']['id_str'])

        # tokenize, strip entity tokens from the text, then drop stopwords
        text_tokenized = word_tokenize(text)
        text_cleaned = clean_entity(text_tokenized)
        filtered_text = [term for term in text_cleaned
                         if term not in self.stop_words]

        # hashtags count as ordinary terms for keyword matching
        filtered_text.extend(hashtags)

        term_features = [1.0 if term in filtered_text else 0.0
                         for term in self.query_set['keyword']]
        user_features = [1.0 if user in user_mentions else 0.0
                         for user in self.query_set['user']]
        # TODO: include users as well?
        # fraction of tweet words outside the keyword query set, as the
        # docstring specifies; guard against empty tweets
        keywords = set(self.query_set['keyword'])
        missed_count = sum(1 for term in filtered_text if term not in keywords)
        missed = float(missed_count) / len(filtered_text) if filtered_text else 0.0

        user_features.append(missed)
        term_features.extend(user_features)

        return term_features
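
    # --- Usage sketch (illustrative; not part of the original source) ---
    # Assumes `clf` is an instance of this class whose query_set and
    # stop_words are already populated (e.g. by pos_term_selection below),
    # and a tweet dict carrying the Twitter JSON fields this method reads;
    # the 'entities' payload is a placeholder for whatever extract_entity
    # expects.
    #
    #     sample_tweet = {
    #         'text': 'Loving the new #python release!',
    #         'user': {'id_str': '12345'},
    #         'entities': {},  # hashtags/mentions/urls as in the Twitter API
    #     }
    #     features = clf.get_feature_vector(sample_tweet)
    #     # layout: one slot per query keyword, one per query user, then missed
    #     assert len(features) == (len(clf.query_set['keyword'])
    #                              + len(clf.query_set['user']) + 1)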
    def pos_term_selection(self):
        """
        Build the query set from a positive sample of tweets
        (stored in the Twitter JSON format):
        1) Tokenize each tweet
        2) Remove stopwords
        3) Count term and user frequencies, keeping entries that
           occur more than 10 times
        """

        # term -> TweetCount; _add_term creates records on first sight
        word_count = {}
        user_count = {}

        def _doc_freq(_term):
            # stopwords and user mentions never enter word_count, so guard
            if _term in word_count:
                word_count[_term]['doc_frequency'] += 1

        def _add_term(_term, _index, _stopwords=()):
            if _term in _stopwords:
                return
            if _term in _index:
                _index[_term]['frequency'] += 1
            else:
                _index[_term] = TweetCount(1, 0)

        pos_tweets = self.get_pos_tweets()
        for tweet in pos_tweets:
            text = tweet['text']
            text_tokenized = word_tokenize(text)
            text_cleaned = clean_entity(text_tokenized)
            for term in text_cleaned:
                _add_term(term, word_count, self.stop_words)

            # extract_entity is assumed to return (hashtags, user_mentions,
            # urls), matching its use in get_feature_vector above
            hashtags, user_mentions, _urls = extract_entity(tweet)

            user_mentions.append(tweet['user']['id_str'])
            for hashtag in hashtags:
                _add_term(hashtag, word_count, self.stop_words)

            for user in user_mentions:
                _add_term(user, user_count)

            # pool words, hashtags, and mentions for document-frequency counts
            hashtags.extend(user_mentions)
            text_cleaned.extend(hashtags)

            # map() is lazy in Python 3; iterate explicitly so _doc_freq runs
            for term in set(text_cleaned):
                _doc_freq(term)

        self.query_set['keyword'] = [
            x for x in word_count if word_count[x]['frequency'] > 10
        ]
        self.query_set['user'] = [
            x for x in user_count if user_count[x]['frequency'] > 10
        ]
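
    # --- Sketch of the TweetCount record both methods assume ---
    # TweetCount is not defined in this snippet. The code constructs it as
    # TweetCount(frequency, doc_frequency) and reads/writes the counts via
    # item access (tc['frequency'], tc['doc_frequency']). A minimal
    # stand-in under those assumptions (not the original implementation):
    #
    #     class TweetCount(dict):
    #         def __init__(self, frequency=0, doc_frequency=0):
    #             super().__init__(frequency=frequency,
    #                              doc_frequency=doc_frequency)
    #
    # extract_entity, clean_entity, and word_tokenize are likewise external:
    # extract_entity is assumed to parse (hashtags, user_mentions, urls) out
    # of tweet['entities'], clean_entity to strip entity tokens from the
    # token list, and word_tokenize to come from nltk.tokenize.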