def features_for_tweet(self, tweet_repr, sid):

        """
        Model::features_for_tweet()

        Purpose: Generate features for a single tweet

        @param tweet_repr. A 3-tuple (begin index, end index, token list) representing a phrase within a tweet
        @param sid.        An int (the ID of the tweet)
        @return            A hash table of features
        """

        # data
        begin    = tweet_repr[0]
        end      = tweet_repr[1]
        sentence = [ unicode(t.decode('utf-8')) for t in tweet_repr[2] ]
        phrase   = sentence[begin:end+1]


        # Feature Dictionary
        features = {}


        # Normalize all text (tokenizer, stem, etc)
        corrected = sentence   # no spelling correction applied at this step; tokens are used as-is
        normalized = utilities.normalize_phrase_TaskA(corrected,ark_tweet=self.ark_tweet)
        flat_normed = [ w for words in normalized for w in words ]   # flattened token list (not used below)


        # Feature: unedited term unigrams
        for tok in phrase:
            if tok == '': continue
            if tok in utilities.stop_words:      continue
            features[('unedited-uni-tok',tok)] = 1


        # Feature: normalized/stemmed term unigrams
        for tok in normalized[begin:end+1]:
            for word in tok:
                if word == '': continue
                if word[0] == '#': continue
                base  = word if (word[-4:]!='_neg') else word[:-4] 
                if base in utilities.stop_words:      continue

                if base.lower() in common:
                    toks = common[base.lower()].split()
                else:
                    toks = [base]

                for t in toks:
                    w = st.stem(t)
                    if word[-4:] == '_neg':
                        w += '_neg'

                    weight = 1
                    if utilities.is_elongated_word(base): weight += 1

                    features[('stemmed_term_unigram', w)] = weight


        # Unigram context window
        window = 3
        prefix_start = max(begin-window, 0)
        context = sentence[prefix_start:end+1+window]

        norm_context = [ w for t in normalized[begin:end+1] for w in t ]

        prefix_terms = []
        for w in reversed([ w for t in normalized[prefix_start:begin] for w in t ]):
            if w == '': break
            prefix_terms.append(w)
        norm_context = list(reversed(prefix_terms)) + norm_context
 
        suffix_terms = []
        for w in [ w for t in normalized[end+1:end+1+window] for w in t ]:
            if w == '': break
            suffix_terms.append(w)
        norm_context = norm_context + suffix_terms


        # Feature: Unigram context (covers leading, phrase, and trailing tokens)
        for word in norm_context:
            if word == '': continue
            w  = word if (word[-4:]!='_neg') else word[:-4] 
            if w in utilities.stop_words:      continue
            w = st.stem(self.speller.correct_spelling([w])[0])
            if word[-4:] == '_neg':
                w += '_neg'
            features[('leading_unigram', w)] = 1

        '''
        print sentence
        print phrase
        for k,v in features.items():
            print '\t', k, '\t', v
        return features
        '''

        # Feature: Lexicon Features
        if enabled_modules['lexicons']:
            #print '\n\n\n'
            #print 'LEX FEATS: ', sentence
            #print begin, end
            # Phrase in question
            lex_feats = lexicon_features(sentence,begin,end+1,ark_tweet=self.ark_tweet)
            context_feats = lexicon_features(sentence,prefix_start,end+1+window,ark_tweet=self.ark_tweet)   # computed but not used (context lexicon features are disabled below)
            features.update(lex_feats)

            '''
            # Leading context
            prev_lex_feats = lexicon_features(sentence,prefix_start,begin, ark_tweet=self.ark_tweet)
            prev_lex_feats = {('prev-'+k[0],k[1]):v for k,v in prev_lex_feats.items()}
            features.update(prev_lex_feats)

            # Trailing context
            next_lex_feats = lexicon_features(sentence,end+1,end+1+window, ark_tweet=self.ark_tweet)
            next_lex_feats = {('next-'+k[0],k[1]):v for k,v in next_lex_feats.items()}
            features.update(next_lex_feats)
            '''


            #print phrase
            #for k,v in lex_feats.items():
            #    print '\t', k, '\t', v
            #print

            #print lex_feats
            #print prev_lex_feats
            #print next_lex_feats


        # Feature: Split hashtag
        if enabled_modules['hashtag']:
            hashtags = [ w for w in context if len(w) and (w[0]=='#') ]
            for ht in hashtags:
                toks = hashtag.split_hashtag(ht)
                for tok in utilities.normalize_phrase_TaskB(toks):
                    w = tok if tok[-4:]!='_neg' else tok[:-4]
                    stemmed = st.stem(w)
                    if tok[-4:] == '_neg': stemmed += '_neg'
                    if len(w) < 2: continue
                    if w in utilities.stop_words: continue
                    features[('stemmed_term_unigram',stemmed)] = 1


        #print
        #print sentence
        #print begin
        #print end
        #print phrase

        # Feature: Prefixes and Suffixes
        n = [2,3,4]
        for words in normalized[begin:end+1]:
            for word in words:
                if len(word) < 2: continue
                if word[-4:] == '_neg': word = word[:-4]
                for j in n:
                    prefix = word[:j]
                    suffix = word[-j:]

                    #print '\tprefix: ', prefix
                    #print '\tsuffix: ', suffix
                    features[ ('prefix',prefix) ] = 1
                    features[ ('suffix',suffix) ] = 1


        # Features: Special forms
        if any([ utilities.is_url(w) for w in phrase]):
            features[ ('contains_url',None) ] = 1
        if any([ w and w[0]=='@'     for w in phrase]):
            features[ ('contains_@'  ,None) ] = 1
        if any([ w and w[0] == '#'   for w in phrase]):
            features[ ('contains_#'  ,None) ] = 1



        # Features: Misc position data
        features['first_unigram'] = sentence[begin]
        features[ 'last_unigram'] = sentence[  end]
        features['phrase_length'] = len(phrase) / 2.0
        features['is_first'] = (begin == 0)
        features['is_last'] = (end == len(sentence)-1)


        # Feature: Whether every word is a stop word
        if all([ tok in utilities.stop_words for tok in phrase]):
            #print phrase
            features[ ('all_stopwords',None) ] = 1


        # Feature: All Caps? (boolean)
        if re.search('^[^a-z]*[A-Z][A-Z][^a-z]*$',''.join(phrase)):
            features[ ('all_caps',None) ] = 1


        # Feature: All Punctuation?
        if re.search('^[^a-zA-Z0-9]+$',''.join(phrase)):
            features[ ('all_punct',None) ] = 1


        # Feature: Emoticon Counts
        elabels = defaultdict(lambda:0)
        for word in norm_context:
            elabel = emoticons.emoticon_type(word)
            if elabel:
                elabels[elabel] += 1
        for k,v in elabels.items():
            featname = k + '-emoticon'
            features[featname] = v


        # Feature: Punctuation counts
        punct = {'!':0, '?':0, '.':0}
        for c in ''.join(context):
            if c in punct: punct[c] += 1
        for k,v in punct.items():
            featname = k + '-count'
            features[featname] = v


        # Features: character streaks
        text = ''.join(phrase)

        #  !-streak
        matches = re.findall('!+',text)
        if matches:
            features['!-streak'] = max([len(w) for w in matches])

        #  ?-streak
        matches = re.findall('\\?+',text)
        if matches:
            features['?-streak'] = max([len(w) for w in matches])

        # ?!-streak
        matches = re.findall('[!\\?]+',text)
        if matches:
            features['?!-streak'] = max([len(w) for w in matches])


        # Feature: Contains elongated long word? (boolean)
        contains_elongated_word = False
        for word in phrase:
            if utilities.is_elongated_word(word):
                contains_elongated_word = True
        if contains_elongated_word:
            features[ ('contains_elongated_word',None) ] = 1


        # Feature: Contains long word? (boolean)
        long_word_threshold = 10
        contains_long_word = False
        for words in normalized[begin:end+1]:
            for word in words:
                if word[-4:]=='_neg': word = word[:-4]
                word = spell.remove_duplicates(word)
                if len(word) and word[0]=='#': continue
                word = word.strip(string.punctuation)
                if len(word) > long_word_threshold:
                    contains_long_word = True
        if contains_long_word:
            features[ ('contains_long_word',None) ] = 1



        return features
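
    # Hypothetical usage sketch (not from the original source): the phrase-level
    # variant above takes a (begin, end, token_list) triple plus a tweet id and
    # returns a dict keyed mostly by (feature_name, value) tuples, e.g.
    #
    #   tweet_repr = (2, 3, ['I', 'really', 'love', 'Mondays', '!'])
    #   feats = model.features_for_tweet(tweet_repr, sid=12345)
    #   # feats contains entries such as ('stemmed_term_unigram', 'love') -> 1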
    def features_for_tweet(self, tweet, sid):

        """
        Model::features_for_tweet()

        Purpose: Generate features for a single tweet

        @param tweet. A string (the text of a tweet)
        @param sid.   An int   (the ID   of a tweet)
        @return       A hash table of features
        """

        # Feature dictionary
        features = {}

        # POS list
        if enabled_modules['ark_tweet']:
            pos = self.ark_tweet.posTags(tweet)
        else:
            pos = None

        # Tweet representation (list of tokens/strings)
        phrase = utilities.tokenize(tweet, self.ark_tweet)


        '''
        # Feature: Unedited Unigram Tokens
        for tok in phrase:
            if tok == '': continue
            if tf_idf.doc_freq(tok) < MIN_COUNT: continue
            if tok in tf_idf.stop_words:         continue
            features[('unedited-uni-tok',tok)] = 1
        '''

        # Edit misspellings
        unis = self.speller.correct_spelling(phrase, pos)

        # Flatten from multi-word tokens (guard against pos=None when ark_tweet is disabled)
        if pos:
            flattened = []
            flat_pos = []
            for tok,tag in zip(unis,pos):
                for w in tok.split():
                    flattened.append(w)
                    flat_pos.append(tag)
        else:
            flattened = unis
            flat_pos  = None

        # Normalize sentence
        normalized = utilities.normalize_phrase_TaskB(flattened)


        # Feature: Processed Unigram Tokens
        uni_freqs = defaultdict(lambda:0)
        for i,word in enumerate(normalized):

            if word == '': continue
            w = word if (word[-4:]!='_neg') else word[:-4]
            if tf_idf.doc_freq(w) < MIN_COUNT: continue
            if w in tf_idf.stop_words:         continue

            # Exclude proper nouns and prepositions
            if flat_pos:
                if flat_pos[i] == '^': continue
                if flat_pos[i] == 'Z': continue
                if flat_pos[i] == 'P': continue
                if flat_pos[i] == 'O': continue
                uni_freqs[word] += 1
            else:
                uni_freqs[word] += 1

        feats = defaultdict(lambda:0)
        for key,tf in uni_freqs.items():
            word = key
            if word[-4:] == '_neg':
                word = word[:-4]
                score = -1
            else:
                score = 1
            #feats[('uni_tok'     ,        word) ] += score
            feats[('uni_stem_tok',st.stem(word))] += score
        features.update(feats)

        return features   # NOTE: early return -- the feature blocks below are unreachable

        #'''
        # Feature: Split hashtag
        if enabled_modules['hashtag']:
            hashtags = [ w for w in normalized if len(w) and (w[0]=='#') ]
            for ht in hashtags:
                toks = hashtag.split_hashtag(ht)
                if (ht not in seen) and (ht not in hashtag.annotations):
                    seen.add(ht)
                    #print ht, '\t', toks
                for tok in utilities.normalize_phrase_TaskB(toks):
                    if tok[-4:] == '_neg':
                        tok = tok[:-4]
                        score = -1
                    else:
                        score = 1
                    if len(tok) > 2:
                        if tf_idf.doc_freq(tok) < MIN_COUNT: continue
                        if tok in tf_idf.stop_words:         continue
                        ###features[('uni_tok'     ,        tok) ] = score
                        features[('uni_stem_tok',st.stem(tok))] = score
        #'''

        #return features

        # Feature: Lexicon Features
        if enabled_modules['lexicons']:
            feats = lexicon_features(normalized)
            features.update(feats)

        return features

        # Feature: Punctuation counts
        for c in '!?':
            val = tweet.count(c)
            if val > 0:
                features['%s-count' % c] = val


        # Features: Text lengths
        #features['phrase_length']   = len(tweet) / 140.0


        # Feature: Contains long word? (boolean)
        long_word_threshold = 8
        contains_long_word = False
        for word in phrase:
            if len(word) == 0: continue
            if word[0] == '@': continue
            if len(word) > long_word_threshold:
                contains_long_word = True
                break
        if contains_long_word:
            features['contains_long_word'] = 1


        # Feature: Emoticon Counts
        elabels = { 'positive':0, 'negative':0, 'neutral':0 }
        for word in phrase:
            elabel = emoticons.emoticon_type(word)
            if elabel:
                elabels[elabel] += 1
        for k,v in elabels.items():
            if v > 0:
                featname = k + '-emoticon'
                features[featname] = v


        # Features: contains twitter-specific features (hashtags & mentions)
        contains_hashtag = False
        contains_mention = False
        for tok in phrase:
            if tok == '': continue
            if tok[0] == '@': contains_mention = True
            if tok[0] == '#': contains_hashtag = True
        if contains_hashtag: features['contains_hashtag'] = 1
        if contains_mention: features['contains_mention'] = 1


        return features


        # Feature: Bigram Tokens
        flattened = []
        for tok in normalized:
            flattened += tok.split()
        for i in range(len(flattened)-1):
            bigram  = tuple(flattened[i:i+2])

            # short circuits
            if any(w == ''                        for w in bigram): continue
            if any(tf_idf.doc_freq(w) < MIN_COUNT for w in bigram): continue
            if any(w in tf_idf.stop_words         for w in bigram): continue

            # context
            t1,t2 = bigram
            if t1[-4:] == '_neg':
                t1 = t1[:-4]
                score = -1
            else:
                score = 1
            if t2[-4:] == '_neg':
                t2 = t2[:-4]

            sbigram = (st.stem(t1),st.stem(t2))
            features[( 'bigram_tok',(t1,t2))] = score
            features[('sbigram_tok',sbigram)] = score



        # Feature: Trigram Tokens
        for i in range(len(flattened)-2):
            trigram  = tuple(flattened[i:i+3])
            if any(w == '' for w in trigram): continue
            if any(tf_idf.doc_freq(w) < MIN_COUNT for w in trigram): continue
            if any(w in tf_idf.stop_words         for w in trigram): continue
            t1,t2,t3 = trigram
            if t1[-4:] == '_neg':
                t1 = t1[:-4]
                score = -1
            else:
                score = 1
            if t2[-4:] == '_neg':
                t2 = t2[:-4]
            if t3[-4:] == '_neg':
                t3 = t3[:-4]

            features[('trigram_tok',(t1,t2,t3))] = score
            #features[('strigram_tok',strigram)] = 1


        # Feature: ark_tweet features (cached based on unescaped text)
        if enabled_modules['ark_tweet']:
            ark_feats = self.ark_tweet.features(tweet)
            features.update(ark_feats)


        '''
        # Feature: twitter_data features
        if enabled_modules['twitter_data']:
            tdata_feats = self.twitter_data.features(sid)
            features.update(tdata_feats)


        # Feature: URL Features
        if enabled_modules['url']:
            urls = [  w  for  w  in  phrase  if  utilities.is_url(w)  ]
            for url in urls:
                feats = self.url.features(url)
                features.update(feats)


        '''


        if enabled_modules['ukb_wsd'] and enabled_modules['ark_tweet']:
            #add ukb wsd features
            if self.ukb.cache.has_key( tweet ):
                wordSenses = self.ukb.cache.get_map( tweet )
            else:
                #print tweet
                wordSenses = self.ukb.ukb_wsd( phrase , self.ark_tweet.posTags( tweet ) )
                self.ukb.cache.add_map( tweet , wordSenses )

            for ws in wordSenses:
                for s in ws:
                    if ('wsd',s[0]) in features:
                        features[('wsd',s[0])] += s[1]
                    else:
                        features[('wsd',s[0])] = s[1]


        #print '\n\n\n'
        #print tweet
        #print
        #print features

        return features
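
    # Hypothetical usage sketch (not from the original source): this tweet-level
    # variant takes the raw tweet text and its id, with feature groups toggled
    # through the module-level `enabled_modules` dict (keys seen in this file:
    # 'ark_tweet', 'lexicons', 'hashtag', 'twitter_data', 'url', 'ukb_wsd'), e.g.
    #
    #   feats = model.features_for_tweet('I love Mondays !', sid=12345)
    #   # feats contains entries such as ('uni_stem_tok', 'love') -> +1
    #   # (tokens ending in '_neg' contribute -1 instead)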
Example #3
    def features_for_tweet(self, tweet, sid):

        """
        Model::features_for_tweet()

        Purpose: Generate features for a single tweet

        @param tweet. A string (the text of a tweet)
        @param sid.   An int   (the ID   of a tweet)
        @return       A hash table of features
        """

        # Feature dictionary
        features = {}

        # POS list
        if enabled_modules['ark_tweet']:
            pos = self.ark_tweet.posTags(tweet)
        else:
            pos = None

        # Tweet representation (list of tokens/strings)
        phrase = utilities.tokenize(tweet, self.ark_tweet)


        #'''
        # Feature: Unedited Unigram Tokens
        for tok in phrase:
            if tok == '': continue
            if tf_idf.doc_freq(tok) < MIN_COUNT: continue
            if tok in tf_idf.stop_words:         continue
            features[('unedited-uni-tok',tok)] = 1
        #'''

        # Edit misspellings
        unis = self.speller.correct_spelling(phrase, pos)

        # Flatten from multi-word tokens
        if pos:
            flattened = []
            flat_pos = []
            for tok,tag in zip(unis,pos):
                for w in tok.split():
                    flattened.append(w)
                    flat_pos.append(tag)
        else:
            flattened = unis
            flat_pos  = None


        # Normalize sentence
        normalized = utilities.normalize_phrase_TaskB(flattened)


        # Feature: Processed Unigram Tokens
        uni_freqs = defaultdict(lambda:0)
        for i,word in enumerate(normalized):

            if word == '': continue
            w = word if (word[-4:]!='_neg') else word[:-4]
            if tf_idf.doc_freq(w) < MIN_COUNT: continue
            if w in tf_idf.stop_words:         continue

            # Exclude proper nouns and prepositions
            if flat_pos:
                if flat_pos[i] == '^': continue
                if flat_pos[i] == 'Z': continue
                if flat_pos[i] == 'P': continue
                if flat_pos[i] == 'O': continue
                uni_freqs[word] += 1
            else:
                uni_freqs[word] += 1

        feats = defaultdict(lambda:0)
        for key,tf in uni_freqs.items():
            word = key
            if word[-4:] == '_neg':
                word = word[:-4]
                score = -1
            else:
                score = 1
            feats[('uni_tok'     ,        word) ] += score
            feats[('uni_stem_tok',st.stem(word))] += score
        features.update(feats)

        #return features

        #'''
        # Feature: Split hashtag
        if enabled_modules['hashtag']:
            hashtags = [ w for w in normalized if len(w) and (w[0]=='#') ]
            for ht in hashtags:
                toks = hashtag.split_hashtag(ht)
                if (ht not in seen) and (ht not in hashtag.annotations):
                    seen.add(ht)
                    #print ht, '\t', toks
                for tok in utilities.normalize_phrase_TaskB(toks):
                    if tok[-4:] == '_neg':
                        tok = tok[:-4]
                        score = -1
                    else:
                        score = 1
                    if len(tok) > 2:
                        if tf_idf.doc_freq(tok) < MIN_COUNT: continue
                        if tok in tf_idf.stop_words:         continue
                        features[('uni_tok'     ,        tok) ] = score
                        features[('uni_stem_tok',st.stem(tok))] = score
        #'''

        #return features

        # Feature: Lexicon Features
        if enabled_modules['lexicons']:
            feats = lexicon_features(normalized)
            features.update(feats)

        #return features

        # Feature: Punctuation counts
        for c in '!?':
            val = tweet.count(c)
            if val > 0:
                features['%s-count' % c] = val


        # Features: Text lengths
        #features['phrase_length']   = len(tweet) / 140.0


        # Feature: Contains long word? (boolean)
        long_word_threshold = 8
        contains_long_word = False
        for word in phrase:
            if len(word) == 0: continue
            if word[0] == '@': continue
            if len(word) > long_word_threshold:
                contains_long_word = True
                break
        if contains_long_word:
            features['contains_long_word'] = 1


        # Feature: Emoticon Counts
        elabels = { 'positive':0, 'negative':0, 'neutral':0 }
        for word in phrase:
            elabel = emoticons.emoticon_type(word)
            if elabel:
                elabels[elabel] += 1
        for k,v in elabels.items():
            if v > 0:
                featname = k + '-emoticon'
                features[featname] = v


        # Features: contains twitter-specific features (hashtags & mentions)
        contains_hashtag = False
        contains_mention = False
        for tok in phrase:
            if tok == '': continue
            if tok[0] == '@': contains_mention = True
            if tok[0] == '#': contains_hashtag = True
        if contains_hashtag: features['contains_hashtag'] = 1
        if contains_mention: features['contains_mention'] = 1


        #return features


        # Feature: Bigram Tokens
        flattened = []
        for tok in normalized:
            flattened += tok.split()
        for i in range(len(flattened)-1):
            bigram  = tuple(flattened[i:i+2])

            # short circuits
            if any(w == ''                        for w in bigram): continue
            if any(tf_idf.doc_freq(w) < MIN_COUNT for w in bigram): continue
            if any(w in tf_idf.stop_words         for w in bigram): continue

            # context
            t1,t2 = bigram
            if t1[-4:] == '_neg':
                t1 = t1[:-4]
                score = -1
            else:
                score = 1
            if t2[-4:] == '_neg':
                t2 = t2[:-4]

            sbigram = (st.stem(t1),st.stem(t2))
            features[( 'bigram_tok',(t1,t2))] = score
            features[('sbigram_tok',sbigram)] = score

        return features   # NOTE: early return -- the remaining feature blocks below are unreachable


        # Feature: Trigram Tokens
        for i in range(len(flattened)-2):
            trigram  = tuple(flattened[i:i+3])
            if any(w == '' for w in trigram): continue
            if any(tf_idf.doc_freq(w) < MIN_COUNT for w in trigram): continue
            if any(w in tf_idf.stop_words         for w in trigram): continue
            t1,t2,t3 = trigram
            if t1[-4:] == '_neg':
                t1 = t1[:-4]
                score = -1
            else:
                score = 1
            if t2[-4:] == '_neg':
                t2 = t2[:-4]
            if t3[-4:] == '_neg':
                t3 = t3[:-4]

            features[('trigram_tok',(t1,t2,t3))] = score
            #features[('strigram_tok',strigram)] = 1

        return features

        # Feature: ark_tweet features (cached based on unescaped text)
        if enabled_modules['ark_tweet']:
            ark_feats = self.ark_tweet.features(tweet)
            features.update(ark_feats)


        return features
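
# --- Illustrative sketch (not part of the original source) ---
# The feature extractors above repeatedly strip a trailing '_neg' negation
# marker, stem the bare token, and then re-attach the marker. A minimal
# standalone version of that pattern, assuming an NLTK PorterStemmer stands
# in for the module-level `st` stemmer used throughout this file:

def stem_with_negation(word, stemmer=None):
    """Stem a token while preserving a trailing '_neg' negation marker."""
    from nltk.stem.porter import PorterStemmer
    if stemmer is None:
        stemmer = PorterStemmer()
    base = word[:-4] if word.endswith('_neg') else word
    stemmed = stemmer.stem(base)
    if word.endswith('_neg'):
        stemmed += '_neg'
    return stemmed

# Example (exact output depends on the stemmer):
#   stem_with_negation('running_neg')  ->  'run_neg'
#   stem_with_negation('loves')        ->  'love'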