Example #1
def getEmoticons(tokenised_tweet, emo_dict):
    feature_emoticons = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word in tokenised_tweet:
        if isEmoticon(word, emo_dict):
            feature_emoticons[FEATURE_COUNT] += 1
            addToCountDict(feature_emoticons[FEATURE_VALUE], word, 1)
    return feature_emoticons
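All of these examples lean on a small addToCountDict helper and on the FEATURE_COUNT / FEATURE_VALUE keys from the assignment's constants module. The helper itself is not shown in this listing; a minimal sketch of the behaviour it is assumed to have (increment a count, creating the key on first use):

def addToCountDict(count_dict, key, value):
    # Assumed behaviour of the helper used throughout this listing:
    # add `value` to count_dict[key], creating the entry if it does not exist yet.
    if key in count_dict:
        count_dict[key] += value
    else:
        count_dict[key] = value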
Example #2
def getUniqueFeaturesForClass(processed_tweets_list,
                              categorywide_unique_features,
                              features_used=FEATURES_SA_DEFAULT):
    '''
    Params:
        processed_tweets_list: [] of tweets of class
        categorywide_unique_features: {} to keep track of unique features of a category e.g. apple, google,...
        features_used: [] of features in use from CS4242_Assg2.constants
    Return:
        class_unique_features: { FEATURE_TYPE_... : {feature: value}}
    '''
    class_unique_features = {}
    for feature in features_used:
        # TODO: whitelist
        if feature == FEATURE_SA_REPLIES or feature == FEATURE_SA_TEMPORAL:
            continue
        class_unique_features[feature] = {}

    for processed_tweet in processed_tweets_list:
        for feature in features_used:
            # TODO: whitelist
            if feature == FEATURE_SA_REPLIES or feature == FEATURE_SA_TEMPORAL:
                continue

            val_dict = processed_tweet[TWEET_FEATURES][feature][FEATURE_VALUE]
            for key, val in val_dict.iteritems():
                addToCountDict(categorywide_unique_features[feature], key, 1)
                addToCountDict(class_unique_features[feature], key, 1)
    return class_unique_features
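A minimal usage sketch based on the docstring above. FEATURE_SA_TEXT is a hypothetical feature constant standing in for whatever entries FEATURES_SA_DEFAULT actually contains:

# Hypothetical feature constant and illustrative shapes (see docstring above).
categorywide_unique = {FEATURE_SA_TEXT: {}}
class_tweets = [
    {TWEET_FEATURES: {FEATURE_SA_TEXT: {FEATURE_COUNT: 1,
                                        FEATURE_VALUE: {'great': 1}}}},
]
class_unique = getUniqueFeaturesForClass(class_tweets,
                                         categorywide_unique,
                                         features_used=[FEATURE_SA_TEXT])
# class_unique[FEATURE_SA_TEXT] == {'great': 1}; the same count is also
# accumulated into categorywide_unique[FEATURE_SA_TEXT] across all classes.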
Example #3
def getCapitalisedText(tokenised_tweet):
    feature_caps = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word in tokenised_tweet:
        #         split_words = removePunctuationsAndNumbers(word)
        #         for w in split_words:
        if isFullCaps(word):
            feature_caps[FEATURE_COUNT] += 1
            addToCountDict(feature_caps[FEATURE_VALUE], word, 1)
    return feature_caps
Example #4
def getHashTags_SA(json_data, tokenised_tweet_no_filter):
    feature_ht = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for ht in json_data['entities']['hashtags']:
        feature_ht[FEATURE_COUNT] += 1
        addToCountDict(feature_ht[FEATURE_VALUE], "#HT_" + ht['text'], 1)

    for ht in getUncapturedHashtags(tokenised_tweet_no_filter):
        feature_ht[FEATURE_COUNT] += 1
        addToCountDict(feature_ht[FEATURE_VALUE], "#HT_" + ht, 1)
    return feature_ht
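json_data here is a parsed tweet object; a trimmed illustration of the 'entities' block this function reads (not the full Twitter API schema), with an illustrative call:

# Trimmed, illustrative tweet JSON; only the fields this function touches.
json_data = {
    'entities': {
        'hashtags': [{'text': 'ios7', 'indices': [24, 29]}],
        'user_mentions': [],
        'urls': []
    }
}
tokens_no_filter = ['cant', 'wait', 'for', '#ios7', '#wwdc']
feature_ht = getHashTags_SA(json_data, tokens_no_filter)
# '#HT_ios7' is counted from the entities block; getUncapturedHashtags is
# expected to pick up hashtags such as '#wwdc' that the entities list missed.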
Example #5
def getPosFeatures(pos_tagged_tweet):
    '''
    {
        FEATURE_COUNT: number of POS (a,n,v,r,other) features in tweet
        FEATURE_VALUE: {pos_tag: number of pos_tag features in tweet}
    }
    '''
    feature_pos = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word, pos in pos_tagged_tweet.iteritems():
        if pos != POS_OTHERS:
            feature_pos[FEATURE_COUNT] += 1
            addToCountDict(feature_pos[FEATURE_VALUE], "POS_TAG_%s" % pos, 1)
    return feature_pos
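A hedged usage sketch; the tag letters follow the docstring above ('a', 'n', 'v', 'r'), and POS_OTHERS is whatever catch-all value the constants module defines:

# Illustrative input: {word: pos_tag}, as described in the docstring.
tagged_tweet = {'love': 'v', 'phone': 'n', 'really': 'r', 'the': POS_OTHERS}
pos_features = getPosFeatures(tagged_tweet)
# Expected result:
# {FEATURE_COUNT: 3,
#  FEATURE_VALUE: {'POS_TAG_v': 1, 'POS_TAG_n': 1, 'POS_TAG_r': 1}}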
Example #6
def extractTextFeatures(json_data):

    tweet_words = {}

    # Remove unnecessary tokens
    tweet_text = filterIrrelevantTokens(json_data)

    unicode_normalized_tweet = unicodedata.normalize(
        'NFKD', tweet_text).encode('ascii', 'ignore').lower()
    tweet_wordlist = normalizer.normalizeTweet(unicode_normalized_tweet)
    # tweet wordlist = ['word', 'word2'] etc

    for word in tweet_wordlist:
        stripped_punct_num_word = removePunctuationsAndNumbers(word)
        for w in stripped_punct_num_word:
            if w not in stopwords.words('english'):
                w2 = stemmer.stem(w)
                addToCountDict(tweet_words, w2, 1)
    return tweet_words
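The normalisation step above folds accented characters down to plain ASCII before tokenising; a small standalone illustration of that standard-library call, shown on a made-up string:

import unicodedata

# NFKD splits accented characters into base letter + combining mark;
# encoding to ASCII with 'ignore' then drops the combining marks.
raw = u'Caf\xe9 is GREAT'
flattened = unicodedata.normalize('NFKD', raw).encode('ascii', 'ignore').lower()
# flattened == 'cafe is great'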
Example #7
def getUserMentions_SA(json_data, tokenised_tweet_no_filter):
    '''
    Get user ids of users mentioned in the tweet; values are recorded as presence (set to 1)
    '''
    feature_obj = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for um in json_data['entities']['user_mentions']:
        feature_obj[FEATURE_COUNT] += 1
        addToCountDict(feature_obj[FEATURE_VALUE], "@UM_" + um['id_str'], 1)


#     addToCountDict(feature_obj[FEATURE_VALUE], "@UM_" + str(json_data['user']['id']), 1)

    for um in getUncapturedUserMentions(tokenised_tweet_no_filter):
        feature_obj[FEATURE_COUNT] += 1
        addToCountDict(feature_obj[FEATURE_VALUE], "@UM_" + um, 1)

    for um in feature_obj[FEATURE_VALUE]:
        feature_obj[FEATURE_VALUE][um] = 1
    return feature_obj
Example #8
def getText(tokenised_tweet, negation_flags, use_negation=False):
    feature_text = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word in tokenised_tweet:
        negated = False
        if word in negation_flags and negation_flags[word]:
            negated = True
        if isEmoticon(word, emo_dict):
            continue

        is_slang = isSlang(word, sd)
        if is_slang:
            translated_slang = translateSlangWord(word, sd)
            tokenised_slang = tokenizer.tokenize(translated_slang)
            if len(tokenised_slang) == 1:  # translated slang
                split_words = tokenised_slang
                is_slang = False
            else:  # don't remove punctuation/numbers
                split_words = [word]
#                 print split_words

        if not is_slang:
            split_words = removePunctuationsAndNumbers(word)
        for w in split_words:
            w = w.lower()
            if w in stopwords.words('english'):  # skip stopwords
                continue
            if not is_slang:  # stem if its not slang
                w = stemmer.stem(w)
            if isFullCaps(word):  # preserve case for words in full caps
                w = w.upper()
            if use_negation and negated:
                w = "NOT_%s" % w
#                 print w
            feature_text[FEATURE_COUNT] += 1
            addToCountDict(feature_text[FEATURE_VALUE], w, 1)

    # TODO: should we use presence? results will drop a bit


#     for text in feature_text[FEATURE_VALUE]:
#         feature_text[FEATURE_VALUE][text] = 1
    return feature_text
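A hedged usage sketch of the negation path. The negation_flags shape (token -> inside-negation-scope flag) is inferred from how it is read above, and the example assumes none of these tokens is in the slang dictionary:

# Illustrative inputs; shapes inferred from the function body above.
tokens = ['i', 'do', 'not', 'like', 'this', ':(']
negation_flags = {'like': True, 'this': True}   # tokens inside the negation scope
text_features = getText(tokens, negation_flags, use_negation=True)
# 'like' would be emitted as 'NOT_like'; 'i', 'do', 'not' and 'this' are dropped
# as stopwords, and ':(' is skipped because it is recognised as an emoticon.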
Example #9
def getPolarityPosCount(pos_tagged_tweet, tweet_word_polarity):
    '''
    {
        FEATURE_COUNT: number of text features of POS (a,n,v,r) with polarity in tweet
        FEATURE_VALUE: {polarity type: number of text features of POS (a,n,v,r) with polarity in tweet}
    }
    '''
    feature_pol = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}

    for word, pos in pos_tagged_tweet.iteritems():
        #         word_polarity = getPolarityFromSWN(word, pos, swn) # empty string if word not found
        if pos == POS_OTHERS:
            continue
        word_polarity = getPolarityOfWord(word, tweet_word_polarity)
        if word_polarity == POLARITY_POSITIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARTITY_PoS_POS", 1)
        if word_polarity == POLARITY_NEGATIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARTITY_PoS_NEG", 1)
        if word_polarity == POLARITY_NEUTRAL:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARTITY_PoS_NEU", 1)

    return feature_pol
Example #10
def getEmoticonPolarityCount(tokenised_tweet, emo_dict):

    feature_emoticons = {
        FEATURE_COUNT: 0,
        FEATURE_VALUE: {
            'EMO_V_POS': 0,
            'EMO_POS': 0,
            'EMO_NEU': 0,
            'EMO_NEG': 0,
            'EMO_V_NEG': 0
        }
    }
    for word in tokenised_tweet:
        if word in emo_dict:
            polarity = getEmoticonPolarity(word, emo_dict)
            if polarity == POLARITY_VERY_POSITIVE:
                feature_emoticons[FEATURE_COUNT] += 1
                addToCountDict(feature_emoticons[FEATURE_VALUE], 'EMO_V_POS',
                               1)
            elif polarity == POLARITY_POSITIVE:
                feature_emoticons[FEATURE_COUNT] += 1
                addToCountDict(feature_emoticons[FEATURE_VALUE], 'EMO_POS', 1)
            elif polarity == POLARITY_NEUTRAL:
                feature_emoticons[FEATURE_COUNT] += 1
                addToCountDict(feature_emoticons[FEATURE_VALUE], 'EMO_NEU', 1)
            elif polarity == POLARITY_NEGATIVE:
                feature_emoticons[FEATURE_COUNT] += 1
                addToCountDict(feature_emoticons[FEATURE_VALUE], 'EMO_NEG', 1)
            elif polarity == POLARITY_VERY_NEGATIVE:
                feature_emoticons[FEATURE_COUNT] += 1
                addToCountDict(feature_emoticons[FEATURE_VALUE], 'EMO_V_NEG',
                               1)


#             addToCountDict(feature_emoticons[FEATURE_VALUE], word, 1)
#     print feature_emoticons
    return feature_emoticons
Example #11
def getCapitalisedTextPolarityCount(cap_text, tweet_word_polarity):
    feature_pol = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word in cap_text[FEATURE_VALUE]:
        word_polarity = getPolarityOfWord(word, tweet_word_polarity)
        if word_polarity == POLARITY_POSITIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARTITY_CAPS_POS", 1)
        if word_polarity == POLARITY_NEGATIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARTITY_CAPS_NEG", 1)
        if word_polarity == POLARITY_NEUTRAL:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARTITY_CAPS_NEU", 1)
    return feature_pol
Example #12
def getTwitterTokenCount(json_data, tokenised_tweet_no_filter):
    feature_obj = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for twitter_token, entity in json_data['entities'].iteritems():
        for value in entity:
            feature_obj[FEATURE_COUNT] += 1
            addToCountDict(feature_obj[FEATURE_VALUE],
                           "TWITTER_TOKEN_" + twitter_token, 1)

    for token in tokenised_tweet_no_filter:
        if re.match(r"^#", token):
            feature_obj[FEATURE_COUNT] += 1
            addToCountDict(feature_obj[FEATURE_VALUE],
                           'TWITTER_TOKEN_hashtags', 1)
        if re.match(r"^@", token):
            feature_obj[FEATURE_COUNT] += 1
            addToCountDict(feature_obj[FEATURE_VALUE],
                           'TWITTER_TOKEN_user_mentions', 1)
        if isValidUrl(token):
            feature_obj[FEATURE_COUNT] += 1
            addToCountDict(feature_obj[FEATURE_VALUE], 'TWITTER_TOKEN_urls', 1)


#     print feature_obj
    return feature_obj
Example #13
def getPolarityTextCount(pos_tagged_tweet, tweet_word_polarity):
    '''
    {
        FEATURE_COUNT: number of text features with polarity
        FEATURE_VALUE: {polarity type: number of text features with that polarity}
    }
    '''

    feature_pol = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}

    for word, pos in pos_tagged_tweet.iteritems():
        word_polarity = getPolarityOfWord(word, tweet_word_polarity)
        if word_polarity == POLARITY_POSITIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_TEXT_POS", 1)
        elif word_polarity == POLARITY_NEGATIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_TEXT_NEG", 1)
        elif word_polarity == POLARITY_NEUTRAL:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_TEXT_NEU", 1)

    return feature_pol
Example #14
def calculateChi2ValuesSA(unique_feat, feature_set, sample_size_set):
    '''
        Calculate Chi2 values of all unique features using lecture's formula
         
        Params:
            unique_feat: [feat] of unique features
            feature_set: {senti_type: features} of features from each sentiment
            sample_size_set: {senti_type:sample_tweet_count} of each sentiment
        Returns:
            chi2_val_dict: {senti_type: {feat: chi2_val}} of calculated chi2 values
    '''

    #     print 'using calculateChi2Values...'
    sentiment_feature_set = {}
    for target_sentiment in feature_set.iterkeys():
        target_senti_feat_dict = feature_set[target_sentiment]
        target_senti_sample_size = sample_size_set[target_sentiment]

        # merge the other sentiment feature dictionaries
        other_senti_feat_dict = {}
        other_senti_sample_size = 0
        for sentiment in feature_set.iterkeys():
            if sentiment != target_sentiment:
                for item, value in feature_set[sentiment].iteritems():
                    addToCountDict(other_senti_feat_dict, item, value)
                other_senti_sample_size += sample_size_set[sentiment]

        target_sent_feat_set = {}
        target_sent_feat_set['target_feat_dict'] = target_senti_feat_dict
        target_sent_feat_set['target_sample_size'] = sum(
            target_senti_feat_dict.itervalues())
        target_sent_feat_set['other_feat_dict'] = other_senti_feat_dict
        target_sent_feat_set['other_sample_size'] = sum(
            other_senti_feat_dict.itervalues())
        sentiment_feature_set[target_sentiment] = target_sent_feat_set

#         target_senti_sample_size = sum(target_senti_feat_dict.itervalues())
#         other_senti_sample_size = sum(other_senti_feat_dict.itervalues())
#         print target_sent_feat_set['target_sample_size'], target_sent_feat_set['other_sample_size']
#         print sum(unique_feat.itervalues())
    chi2_val_dict = {}
    for target_sentiment in feature_set.iterkeys():
        target_sent_feat_set = sentiment_feature_set[target_sentiment]
        target_senti_feat_dict = target_sent_feat_set['target_feat_dict']
        target_senti_sample_size = target_sent_feat_set['target_sample_size']

        other_senti_feat_dict = target_sent_feat_set['other_feat_dict']
        other_senti_sample_size = target_sent_feat_set['other_sample_size']

        chi2_word_val = {}
        for f in target_senti_feat_dict.iterkeys():
            # calculate chi2
            A = 0  # num of tweets in target cat that contain the feature
            B = 0  # num of tweets not in target cat that contain the feature
            C = 0  # num of tweets in target cat that do not contain the feature
            D = 0  # num of tweets not in target cat that do not contain the feature

            if f in target_senti_feat_dict:
                A = target_senti_feat_dict[f]
            if f in other_senti_feat_dict:
                B = other_senti_feat_dict[f]

            C = target_senti_sample_size - A
            D = other_senti_sample_size - B

            chi2_word_val[f] = calculateChi2(A, B, C, D)


#             print f,A,B,C,D, chi2_word_val[f]
#             break

        chi2_val_dict[target_sentiment] = chi2_word_val
    return chi2_val_dict
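calculateChi2 itself is defined elsewhere in the repository; a sketch of one standard form of the 2x2 chi-square statistic that matches the A/B/C/D counts set up above (an assumption about that helper, not the repository's actual code):

def calculateChi2(A, B, C, D):
    # Standard 2x2 contingency-table chi-square:
    #   N * (A*D - C*B)^2 / ((A+C) * (B+D) * (A+B) * (C+D))
    # A, B, C, D follow the comments in calculateChi2ValuesSA above.
    N = A + B + C + D
    denominator = float((A + C) * (B + D) * (A + B) * (C + D))
    if denominator == 0:
        return 0.0
    return N * (A * D - C * B) ** 2 / denominator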
Example #15
def getKeyInfoForClassifier(filename,
                            categories_list,
                            groundtruth_list,
                            features=FEATURES_DEFAULT):
    '''
    Extracts all features from input files.
    
    Params:
        filename: input DATA file
        categories_list: list of categories, e.g. ['Apple', 'Google', 'Twitter']
        groundtruth_list: list of ground truths, e.g. [{CATEGORY: category, POLARITY: polarity, TWEET_ID: tweetid}]
        features (optional):     [FEATURE_TEXT, FEATURE_HASHTAG, FEATURE_GEOINFO, FEATURE_FOLLOWED_CATEGORIES,
                                FEATURE_USER, FEATURE_USER_MENTIONS]
    
    Returns: 
        {
            'category' : {
                POSITIVE:{
                    PROCESSED_TWEETS : [{
                        TWEET_FULL: This was a triumph, 
                        TWEET_FEATURES: {
                            FEATURE_TEXT: {} , 
                            FEATURE_GEOLOCATION : str
                        }
                        
                    }],
                    FEATURES: {
                        FEATURE_TEXT: {} ...
                    }
                },
                
                NEGATIVE:{
                    PROCESSED_TWEETS : [{
                        TWEET_FULL: This was a triumph, 
                        TWEET_FEATURES: {
                            FEATURE_TEXT: {} , 
                            FEATURE_GEOLOCATION : str
                        }
                        
                    }],
                    FEATURES: {
                        FEATURE_TEXT: {} ...
                        FEATURE_HASHTAG: []
                    }
                },
                
                UNIQUE_FEATURES: {
                    FEATURE_TEXT: {} , ... 
                }
            }
        }
    '''

    returnmap = {}

    for category in categories_list:
        # category wide variables
        processed_tweets_list = []
        unique_features_map = {}

        positive_processed_tweet_list = []
        negative_processed_tweet_list = []

        positive_features_map = {}
        negative_features_map = {}

        # Initialize unique feature maps within category
        for feature in features:
            if feature == FEATURE_TEXT:
                feature_text_unique = {}
                pos_feature_text_unique = {}
                neg_feature_text_unique = {}

            elif feature == FEATURE_HASHTAG:
                feature_hashtag_unique = {}
                pos_feature_hashtag_unique = {}
                neg_feature_hashtag_unique = {}

            elif feature == FEATURE_GEOINFO:
                feature_geoinfo_unique = {}
                pos_feature_geoinfo_unique = {}
                neg_feature_geoinfo_unique = {}

            elif feature == FEATURE_FOLLOWED_CATEGORIES:
                feature_followed_cat_unique = {}
                pos_feature_followed_cat_unique = {}
                neg_feature_followed_cat_unique = {}

            elif feature == FEATURE_USER:
                feature_user_unique = {}
                pos_feature_user_unique = {}
                neg_feature_user_unique = {}

            elif feature == FEATURE_USER_MENTIONS:
                feature_usermentions_unique = {}
                pos_feature_usermentions_unique = {}
                neg_feature_usermentions_unique = {}

            elif feature == FEATURE_CATEGORY:
                feature_category_unique = {}
                pos_feature_category_unique = {}
                neg_feature_category_unique = {}

        # Extract & Process Features
        with codecs.open(filename, encoding='cp1252') as k:
            for index, line in enumerate(k):
                json_data = json.loads(line, encoding='cp1252')
                tweet_keyinfo = extractFeaturesFromTweet(
                    json_data, categories_list, features, category)

                if groundtruth_list[index][CATEGORY] == category:
                    positive_processed_tweet_list.append(tweet_keyinfo)
                else:
                    negative_processed_tweet_list.append(tweet_keyinfo)

                processed_tweets_list.append(tweet_keyinfo)

        # check unique tweet_keyinfo across positive, negative and all
        for tweet_keyinfo in positive_processed_tweet_list:
            for feature in features:
                if feature == FEATURE_TEXT:
                    text_count_dict = tweet_keyinfo[TWEET_FEATURES][
                        FEATURE_TEXT]
                    for key, count in text_count_dict.iteritems():
                        addToCountDict(pos_feature_text_unique, key, 1)

                elif feature == FEATURE_HASHTAG:
                    for hashtag in tweet_keyinfo[TWEET_FEATURES][
                            FEATURE_HASHTAG]:
                        addToCountDict(pos_feature_hashtag_unique, hashtag, 1)

                elif feature == FEATURE_GEOINFO:
                    if tweet_keyinfo[TWEET_FEATURES][FEATURE_GEOINFO] != '':
                        addToCountDict(
                            pos_feature_geoinfo_unique,
                            tweet_keyinfo[TWEET_FEATURES][FEATURE_GEOINFO], 1)

                elif feature == FEATURE_FOLLOWED_CATEGORIES:
                    # TODO: Consider implementing
                    pass

                elif feature == FEATURE_USER:
                    addToCountDict(pos_feature_user_unique,
                                   tweet_keyinfo[TWEET_FEATURES][FEATURE_USER],
                                   1)

                elif feature == FEATURE_USER_MENTIONS:
                    for usermention in tweet_keyinfo[TWEET_FEATURES][
                            FEATURE_USER_MENTIONS]:
                        addToCountDict(pos_feature_usermentions_unique,
                                       usermention, 1)

                elif feature == FEATURE_CATEGORY:
                    for item in tweet_keyinfo[TWEET_FEATURES][
                            FEATURE_CATEGORY]:
                        addToCountDict(pos_feature_category_unique, item, 1)

        # debug files will be written only if settings.DEBUG_CODE = True


#         writeDebugCountDictToFile("%s_pos_feature_text_unique.txt" % category, pos_feature_text_unique)
#         writeDebugCountDictToFile("%s_pos_feature_hashtag_unique.txt" % category, pos_feature_hashtag_unique)
#         writeDebugCountDictToFile("%s_pos_feature_geoinfo_unique.txt" % category, pos_feature_geoinfo_unique)
#         writeDebugCountDictToFile("%s_pos_feature_user_unique.txt" % category, pos_feature_user_unique)
#         writeDebugCountDictToFile("%s_pos_feature_usermentions_unique.txt" % category, pos_feature_usermentions_unique)

        for tweet_keyinfo in negative_processed_tweet_list:
            for feature in features:
                if feature == FEATURE_TEXT:
                    text_count_dict = tweet_keyinfo[TWEET_FEATURES][
                        FEATURE_TEXT]
                    for key, count in text_count_dict.iteritems():
                        addToCountDict(neg_feature_text_unique, key, 1)
                elif feature == FEATURE_HASHTAG:
                    for hashtag in tweet_keyinfo[TWEET_FEATURES][
                            FEATURE_HASHTAG]:
                        addToCountDict(neg_feature_hashtag_unique, hashtag, 1)
                elif feature == FEATURE_GEOINFO:
                    addToCountDict(
                        neg_feature_geoinfo_unique,
                        tweet_keyinfo[TWEET_FEATURES][FEATURE_GEOINFO], 1)
                elif feature == FEATURE_FOLLOWED_CATEGORIES:
                    # TODO: Consider implementing
                    pass
                elif feature == FEATURE_USER:
                    addToCountDict(neg_feature_user_unique,
                                   tweet_keyinfo[TWEET_FEATURES][FEATURE_USER],
                                   1)

                elif feature == FEATURE_USER_MENTIONS:
                    for usermention in tweet_keyinfo[TWEET_FEATURES][
                            FEATURE_USER_MENTIONS]:
                        addToCountDict(neg_feature_usermentions_unique,
                                       usermention, 1)

                elif feature == FEATURE_CATEGORY:
                    for item in tweet_keyinfo[TWEET_FEATURES][
                            FEATURE_CATEGORY]:
                        addToCountDict(neg_feature_category_unique, item, 1)

        for tweet_keyinfo in processed_tweets_list:
            for feature in features:
                if feature == FEATURE_TEXT:
                    text_count_dict = tweet_keyinfo[TWEET_FEATURES][
                        FEATURE_TEXT]
                    for key, count in text_count_dict.iteritems():
                        addToCountDict(feature_text_unique, key, 1)

                elif feature == FEATURE_HASHTAG:
                    for hashtag in tweet_keyinfo[TWEET_FEATURES][
                            FEATURE_HASHTAG]:
                        addToCountDict(feature_hashtag_unique, hashtag, 1)
                elif feature == FEATURE_GEOINFO:
                    addToCountDict(
                        feature_geoinfo_unique,
                        tweet_keyinfo[TWEET_FEATURES][FEATURE_GEOINFO], 1)
                elif feature == FEATURE_FOLLOWED_CATEGORIES:
                    # TODO: Consider implementing
                    pass
                elif feature == FEATURE_USER:
                    addToCountDict(feature_user_unique,
                                   tweet_keyinfo[TWEET_FEATURES][FEATURE_USER],
                                   1)
                elif feature == FEATURE_USER_MENTIONS:
                    for usermention in tweet_keyinfo[TWEET_FEATURES][
                            FEATURE_USER_MENTIONS]:
                        addToCountDict(feature_usermentions_unique,
                                       usermention, 1)

                elif feature == FEATURE_CATEGORY:
                    for item in tweet_keyinfo[TWEET_FEATURES][
                            FEATURE_CATEGORY]:
                        addToCountDict(feature_category_unique, item, 1)

        for feature in features:
            if feature == FEATURE_TEXT:
                unique_features_map[FEATURE_TEXT] = feature_text_unique
                positive_features_map[FEATURE_TEXT] = pos_feature_text_unique
                negative_features_map[FEATURE_TEXT] = neg_feature_text_unique
            elif feature == FEATURE_HASHTAG:
                unique_features_map[FEATURE_HASHTAG] = feature_hashtag_unique
                positive_features_map[
                    FEATURE_HASHTAG] = pos_feature_hashtag_unique
                negative_features_map[
                    FEATURE_HASHTAG] = neg_feature_hashtag_unique
            elif feature == FEATURE_GEOINFO:
                unique_features_map[FEATURE_GEOINFO] = feature_geoinfo_unique
                positive_features_map[
                    FEATURE_GEOINFO] = pos_feature_geoinfo_unique
                negative_features_map[
                    FEATURE_GEOINFO] = neg_feature_geoinfo_unique
            elif feature == FEATURE_FOLLOWED_CATEGORIES:
                unique_features_map[
                    FEATURE_FOLLOWED_CATEGORIES] = feature_followed_cat_unique
                positive_features_map[
                    FEATURE_FOLLOWED_CATEGORIES] = pos_feature_followed_cat_unique
                negative_features_map[
                    FEATURE_FOLLOWED_CATEGORIES] = neg_feature_followed_cat_unique
            elif feature == FEATURE_USER:
                unique_features_map[FEATURE_USER] = feature_user_unique
                positive_features_map[FEATURE_USER] = pos_feature_user_unique
                negative_features_map[FEATURE_USER] = neg_feature_user_unique
            elif feature == FEATURE_USER_MENTIONS:
                unique_features_map[
                    FEATURE_USER_MENTIONS] = feature_usermentions_unique
                positive_features_map[
                    FEATURE_USER_MENTIONS] = pos_feature_usermentions_unique
                negative_features_map[
                    FEATURE_USER_MENTIONS] = neg_feature_usermentions_unique
            elif feature == FEATURE_CATEGORY:
                unique_features_map[FEATURE_CATEGORY] = feature_category_unique
                positive_features_map[
                    FEATURE_CATEGORY] = pos_feature_category_unique
                negative_features_map[
                    FEATURE_CATEGORY] = neg_feature_category_unique

        returnmap[category] = {}
        returnmap[category][POSITIVE] = {
            PROCESSED_TWEETS: positive_processed_tweet_list,
            FEATURES: positive_features_map
        }
        returnmap[category][NEGATIVE] = {
            PROCESSED_TWEETS: negative_processed_tweet_list,
            FEATURES: negative_features_map
        }
        returnmap[category][UNIQUE_FEATURES] = unique_features_map

    return returnmap
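A hedged usage sketch based on the docstring; the file name and category names are placeholders, and the constants are assumed to come from the assignment's constants module:

# Illustrative call; 'training_data.json' and the category names are placeholders.
groundtruth = [{CATEGORY: 'Apple', POLARITY: 'positive', TWEET_ID: '123456'}]
keyinfo = getKeyInfoForClassifier('training_data.json',
                                  ['Apple', 'Google', 'Twitter'],
                                  groundtruth)
apple_text_counts = keyinfo['Apple'][UNIQUE_FEATURES][FEATURE_TEXT]
# apple_text_counts maps each text feature to the number of tweets
# (across both classes) in which it appeared.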