def get_feature_vector(self, tweet):
    """Build the feature vector for a single tweet.

    Feature vector contains:
      1) 1.0/0.0 per query keyword: whether it appears in the tweet text
         (stopwords removed, hashtags included as terms)
      2) 1.0/0.0 per query user: whether that user is mentioned (the
         tweet author counts as a mention)
      3) missed = # tweet words not in the query keyword set / # tweet words

    @param tweet: a tweet in the Twitter JSON format
    @return list: [keyword features..., user features..., missed]
    """
    text = tweet['text']
    # NOTE(review): extract_entity returns 3 values here but 2 in
    # pos_term_selection — confirm which signature is current.
    hashtags, user_mentions, urls = extract_entity(tweet)
    # BUG FIX: the Twitter API field is 'id_str', not 'str_id'
    # (pos_term_selection already uses the correct key).
    user_mentions.append(tweet['user']['id_str'])

    # Tokenize, strip entities from the text, then drop stopwords;
    # hashtags are re-added as ordinary terms.
    text_tokenized = word_tokenize(text)
    text_cleaned = clean_entity(text_tokenized)
    filtered_text = [term for term in text_cleaned
                     if term not in self.stop_words]
    filtered_text.extend(hashtags)

    # Membership tests against a set are O(1) instead of O(n) per query term.
    tweet_terms = set(filtered_text)
    term_features = [1.0 if term in tweet_terms else 0.0
                     for term in self.query_set['keyword']]
    user_features = [1.0 if user in user_mentions else 0.0
                     for user in self.query_set['user']]

    # BUG FIX: the old code computed sum(term_features)/len(filtered_text)
    # — the number of *matched query keywords* over the tweet length —
    # which contradicts the documented definition of 'missed'. Compute the
    # fraction of tweet words NOT covered by the query keyword set, and
    # guard against empty tweets (ZeroDivisionError).
    if filtered_text:
        keyword_set = set(self.query_set['keyword'])
        missed_count = sum(1 for term in filtered_text
                           if term not in keyword_set)
        missed = float(missed_count) / len(filtered_text)
    else:
        missed = 0.0

    # TODO: include users in the 'missed' ratio as well?
    user_features.append(missed)
    term_features.extend(user_features)
    return term_features
def pos_term_selection(self):
    """Build the query set from a positive sample of tweets.

    For each positive tweet (Twitter JSON format):
      1) tokenize the text
      2) remove stopwords
      3) count term / hashtag / mentioned-user frequencies and, per
         tweet, document frequencies

    Terms and users with frequency > 10 are written to
    self.query_set['keyword'] and self.query_set['user'].
    """
    # BUG FIX: dict(TweetCount) raises TypeError — a class is neither a
    # mapping nor an iterable of pairs. These are plain term -> TweetCount
    # dicts.
    word_count = {}
    user_count = {}

    def _doc_freq(_term):
        # Count one tweet-level occurrence of a term.
        # BUG FIX: user mentions are tallied in user_count only, so they
        # may be absent from word_count — the old code could KeyError here.
        if _term in word_count:
            word_count[_term]['doc_frequency'] += 1

    def _add_term(_term, _index, _stopwords=()):
        # Immutable default instead of the mutable `list()` default.
        if _term in _stopwords:
            return
        if _term in _index:
            # BUG FIX: bump the frequency field; the old `_index[_term] += 1`
            # added an int to a TweetCount. Assumes TweetCount supports
            # item access by field name, as the filtering below already
            # relies on — TODO confirm against TweetCount's definition.
            _index[_term]['frequency'] += 1
        else:
            _index[_term] = TweetCount(1, 0)

    pos_tweets = self.get_pos_tweets()
    for tweet in pos_tweets:
        text = tweet['text']
        text_tokenized = word_tokenize(text)
        text_cleaned = clean_entity(text_tokenized)
        for term in text_cleaned:
            _add_term(term, word_count, self.stop_words)

        # NOTE(review): extract_entity is unpacked into 2 values here but
        # 3 in get_feature_vector — confirm which signature is current.
        hashtags, user_mentions = extract_entity(tweet)
        user_mentions.append(tweet['user']['id_str'])
        for hashtag in hashtags:
            _add_term(hashtag, word_count, self.stop_words)
        for user in user_mentions:
            _add_term(user, user_count)

        hashtags.extend(user_mentions)
        text_cleaned.extend(hashtags)
        # BUG FIX: map() is lazy in Python 3, so the old
        # `map(_doc_freq, set(text_cleaned))` never executed its side
        # effects. Iterate explicitly.
        for term in set(text_cleaned):
            _doc_freq(term)

    self.query_set['keyword'] = [
        term for term in word_count if word_count[term]['frequency'] > 10
    ]
    self.query_set['user'] = [
        user for user in user_count if user_count[user]['frequency'] > 10
    ]
def pos_term_selection(self):
    """Build the query set from a positive sample of tweets.

    For each positive tweet (Twitter JSON format):
      1) tokenize the text
      2) remove stopwords
      3) count term / hashtag / mentioned-user frequencies and, per
         tweet, document frequencies

    Terms and users with frequency > 10 are written to
    self.query_set['keyword'] and self.query_set['user'].
    """
    # BUG FIX: dict(TweetCount) raises TypeError — a class is neither a
    # mapping nor an iterable of pairs. These are plain term -> TweetCount
    # dicts.
    word_count = {}
    user_count = {}

    def _doc_freq(_term):
        # Count one tweet-level occurrence of a term.
        # BUG FIX: user mentions are tallied in user_count only, so they
        # may be absent from word_count — the old code could KeyError here.
        if _term in word_count:
            word_count[_term]['doc_frequency'] += 1

    def _add_term(_term, _index, _stopwords=()):
        # Immutable default instead of the mutable `list()` default.
        if _term in _stopwords:
            return
        if _term in _index:
            # BUG FIX: bump the frequency field; the old `_index[_term] += 1`
            # added an int to a TweetCount. Assumes TweetCount supports
            # item access by field name, as the filtering below already
            # relies on — TODO confirm against TweetCount's definition.
            _index[_term]['frequency'] += 1
        else:
            _index[_term] = TweetCount(1, 0)

    pos_tweets = self.get_pos_tweets()
    for tweet in pos_tweets:
        text = tweet['text']
        text_tokenized = word_tokenize(text)
        text_cleaned = clean_entity(text_tokenized)
        for term in text_cleaned:
            _add_term(term, word_count, self.stop_words)

        # NOTE(review): extract_entity is unpacked into 2 values here but
        # 3 in get_feature_vector — confirm which signature is current.
        hashtags, user_mentions = extract_entity(tweet)
        user_mentions.append(tweet['user']['id_str'])
        for hashtag in hashtags:
            _add_term(hashtag, word_count, self.stop_words)
        for user in user_mentions:
            _add_term(user, user_count)

        hashtags.extend(user_mentions)
        text_cleaned.extend(hashtags)
        # BUG FIX: map() is lazy in Python 3, so the old
        # `map(_doc_freq, set(text_cleaned))` never executed its side
        # effects. Iterate explicitly.
        for term in set(text_cleaned):
            _doc_freq(term)

    self.query_set['keyword'] = [
        term for term in word_count if word_count[term]['frequency'] > 10
    ]
    self.query_set['user'] = [
        user for user in user_count if user_count[user]['frequency'] > 10
    ]
def get_feature_vector(self, tweet):
    """Build the feature vector for a single tweet.

    Feature vector contains:
      1) 1.0/0.0 per query keyword: whether it appears in the tweet text
         (stopwords removed, hashtags included as terms)
      2) 1.0/0.0 per query user: whether that user is mentioned (the
         tweet author counts as a mention)
      3) missed = # tweet words not in the query keyword set / # tweet words

    @param tweet: a tweet in the Twitter JSON format
    @return list: [keyword features..., user features..., missed]
    """
    text = tweet['text']
    # NOTE(review): extract_entity returns 3 values here but 2 in
    # pos_term_selection — confirm which signature is current.
    hashtags, user_mentions, urls = extract_entity(tweet)
    # BUG FIX: the Twitter API field is 'id_str', not 'str_id'
    # (pos_term_selection already uses the correct key).
    user_mentions.append(tweet['user']['id_str'])

    # Tokenize, strip entities from the text, then drop stopwords;
    # hashtags are re-added as ordinary terms.
    text_tokenized = word_tokenize(text)
    text_cleaned = clean_entity(text_tokenized)
    filtered_text = [term for term in text_cleaned
                     if term not in self.stop_words]
    filtered_text.extend(hashtags)

    # Membership tests against a set are O(1) instead of O(n) per query term.
    tweet_terms = set(filtered_text)
    term_features = [1.0 if term in tweet_terms else 0.0
                     for term in self.query_set['keyword']]
    user_features = [1.0 if user in user_mentions else 0.0
                     for user in self.query_set['user']]

    # BUG FIX: the old code computed sum(term_features)/len(filtered_text)
    # — the number of *matched query keywords* over the tweet length —
    # which contradicts the documented definition of 'missed'. Compute the
    # fraction of tweet words NOT covered by the query keyword set, and
    # guard against empty tweets (ZeroDivisionError).
    if filtered_text:
        keyword_set = set(self.query_set['keyword'])
        missed_count = sum(1 for term in filtered_text
                           if term not in keyword_set)
        missed = float(missed_count) / len(filtered_text)
    else:
        missed = 0.0

    # TODO: include users in the 'missed' ratio as well?
    user_features.append(missed)
    term_features.extend(user_features)
    return term_features