def light_normalize(sentence, begin, end, ark_tweet):
    """
    light_normalize()

    Purpose: Normalize a sentence, take the tokens of the phrase
             [begin, end) and expand entries found in `common`.

    @param sentence.  The sentence, passed unchanged to normalize_phrase_TaskA.
    @param begin.     Index of the first phrase token (inclusive).
    @param end.       Index one past the last phrase token (exclusive).
    @param ark_tweet. Passed unchanged to normalize_phrase_TaskA.
    @return           A flat list of lower-cased tokens with surrounding
                      punctuation stripped. A token found in `common` is
                      replaced by the whitespace-split words of its entry.
                      A '_neg' suffix is carried over to every emitted word.
                      Tokens starting with '#' are additionally kept in
                      their unstripped form.

    NOTE(review): this function is defined twice back-to-back in this file;
    the later definition is the one that stays bound after import.
    """
    neg_suffix = '_neg'

    # Normalize phrase
    normalized = normalize_phrase_TaskA(sentence, ark_tweet)

    # Get given phrase (each normalized position holds a list of words)
    phrase = [w.lower() for words in normalized[begin:end] for w in words]

    # Common spelling mistakes
    result = []
    for tok in phrase:
        if tok == '':
            continue

        # Negation is a *suffix* marker. Testing with `in` (as before) would
        # chop four characters off any token that merely contains '_neg'.
        negated = tok.endswith(neg_suffix)
        if negated:
            tok = tok[:-len(neg_suffix)]

        key = tok.strip(string.punctuation)
        if key in common:
            expansion = common[key].split()
            if negated:
                expansion = [w + neg_suffix for w in expansion]
            result += expansion
        else:
            if negated:
                key += neg_suffix
            result.append(key)

        # Keep the raw hashtag as well. startswith() is safe when the token
        # was exactly '_neg' and is now empty (tok[0] raised IndexError).
        # NOTE(review): indentation was lost in the reviewed source; this
        # check is assumed to apply to every token, not only to the
        # non-`common` branch -- confirm against the original layout.
        if tok.startswith('#'):
            result.append(tok)

    return result
def light_normalize(sentence, begin, end, ark_tweet):
    """
    light_normalize()

    Purpose: Normalize a sentence, take the tokens of the phrase
             [begin, end) and expand entries found in `common`.

    @param sentence.  The sentence, passed unchanged to normalize_phrase_TaskA.
    @param begin.     Index of the first phrase token (inclusive).
    @param end.       Index one past the last phrase token (exclusive).
    @param ark_tweet. Passed unchanged to normalize_phrase_TaskA.
    @return           A flat list of lower-cased tokens with surrounding
                      punctuation stripped. A token found in `common` is
                      replaced by the whitespace-split words of its entry.
                      A '_neg' suffix is carried over to every emitted word.
                      Tokens starting with '#' are additionally kept in
                      their unstripped form.

    NOTE(review): an earlier definition with the same name and logic
    precedes this one in the file; this later definition shadows it.
    """
    neg_suffix = '_neg'

    # Normalize phrase
    normalized = normalize_phrase_TaskA(sentence, ark_tweet)

    # Get given phrase (each normalized position holds a list of words)
    phrase = [w.lower() for words in normalized[begin:end] for w in words]

    # Common spelling mistakes
    result = []
    for tok in phrase:
        if tok == '':
            continue

        # Negation is a *suffix* marker. Testing with `in` (as before) would
        # chop four characters off any token that merely contains '_neg'.
        negated = tok.endswith(neg_suffix)
        if negated:
            tok = tok[:-len(neg_suffix)]

        key = tok.strip(string.punctuation)
        if key in common:
            expansion = common[key].split()
            if negated:
                expansion = [w + neg_suffix for w in expansion]
            result += expansion
        else:
            if negated:
                key += neg_suffix
            result.append(key)

        # Keep the raw hashtag as well. startswith() is safe when the token
        # was exactly '_neg' and is now empty (tok[0] raised IndexError).
        # NOTE(review): indentation was lost in the reviewed source; this
        # check is assumed to apply to every token, not only to the
        # non-`common` branch -- confirm against the original layout.
        if tok.startswith('#'):
            result.append(tok)

    return result
def features_for_tweet(self, tweet_repr, sid):
    """
    Model::features_for_tweet()

    Purpose: Generate features for a single tweet

    @param tweet_repr. A 3-tuple (begin, end, tokens). `begin` and `end`
                       are the inclusive token indices of the phrase of
                       interest; `tokens` is the tokenized sentence
                       (UTF-8 byte strings or already-decoded text).
    @param sid.        Not used by this method.
    @return            A hash table of features. Keys are either
                       (name, value) tuples or plain strings.
    """

    def split_neg(word):
        # Separate a trailing '_neg' negation marker from a word.
        if word.endswith('_neg'):
            return word[:-4], True
        return word, False

    # data
    begin = tweet_repr[0]
    end = tweet_repr[1]
    # Decode byte strings only. Calling .decode() on text that is already
    # decoded fails for non-ASCII content on Python 2 and does not exist on
    # Python 3, so such tokens are passed through untouched.
    sentence = [
        t.decode('utf-8') if isinstance(t, bytes) else t
        for t in tweet_repr[2]
    ]
    phrase = sentence[begin:end + 1]
    text = ''.join(phrase)

    # Feature Dictionary
    features = {}

    # Normalize all text (tokenizer, stem, etc)
    # Each position of `normalized` holds a list of words.
    normalized = utilities.normalize_phrase_TaskA(
        sentence, ark_tweet=self.ark_tweet)

    # Feature: unedited term unigrams
    for tok in phrase:
        if tok == '' or tok in utilities.stop_words:
            continue
        features[('unedited-uni-tok', tok)] = 1

    # Feature: stemmed term unigrams (hashtags are handled separately below)
    for words in normalized[begin:end + 1]:
        for word in words:
            if word == '' or word[0] == '#':
                continue

            base, negated = split_neg(word)
            if base in utilities.stop_words:
                continue

            lowered = base.lower()
            if lowered in common:
                toks = common[lowered].split()
            else:
                toks = [base]

            # Elongated words count double.
            weight = 2 if utilities.is_elongated_word(base) else 1

            for t in toks:
                w = st.stem(t)
                if negated:
                    w += '_neg'
                features[('stemmed_term_unigram', w)] = weight

    # Unigram context window
    window = 3
    prefix_start = max(begin - window, 0)
    context = sentence[prefix_start:end + 1 + window]

    norm_context = [w for t in normalized[begin:end + 1] for w in t]

    # Walk backwards from the phrase, stopping at the first empty word.
    prefix_terms = []
    leading = [w for t in normalized[prefix_start:begin] for w in t]
    for w in reversed(leading):
        if w == '':
            break
        prefix_terms.append(w)
    prefix_terms.reverse()

    # Walk forwards from the phrase, stopping at the first empty word.
    suffix_terms = []
    trailing = [w for t in normalized[end + 1:end + 1 + window] for w in t]
    for w in trailing:
        if w == '':
            break
        suffix_terms.append(w)

    norm_context = prefix_terms + norm_context + suffix_terms

    # Feature: Unigram context
    for word in norm_context:
        if word == '':
            continue
        w, negated = split_neg(word)
        if w in utilities.stop_words:
            continue
        w = st.stem(self.speller.correct_spelling([w])[0])
        if negated:
            w += '_neg'
        features[('leading_unigram', w)] = 1

    # Feature: Lexicon Features
    if enabled_modules['lexicons']:
        # Phrase in question.
        # A second lexicon_features() call over the whole context window
        # used to be made here, but its result was never used; it has been
        # dropped. Separate lexicon features for the leading / trailing
        # context were likewise disabled in the original.
        lex_feats = lexicon_features(sentence, begin, end + 1,
                                     ark_tweet=self.ark_tweet)
        features.update(lex_feats)

    # Feature: Split hashtag
    if enabled_modules['hashtag']:
        hashtags = [w for w in context if len(w) and (w[0] == '#')]
        for ht in hashtags:
            toks = hashtag.split_hashtag(ht)
            for tok in utilities.normalize_phrase_TaskB(toks):
                w, negated = split_neg(tok)
                if len(w) < 2:
                    continue
                if w in utilities.stop_words:
                    continue
                stemmed = st.stem(w)
                if negated:
                    stemmed += '_neg'
                features[('stemmed_term_unigram', stemmed)] = 1

    # Feature: Prefixes and Suffixes
    for words in normalized[begin:end + 1]:
        for word in words:
            # Length is checked before the negation marker is removed,
            # as in the original.
            if len(word) < 2:
                continue
            word, _ = split_neg(word)
            for j in (2, 3, 4):
                features[('prefix', word[:j])] = 1
                features[('suffix', word[-j:])] = 1

    # Features: Special forms
    if any(utilities.is_url(w) for w in phrase):
        features[('contains_url', None)] = 1
    if any(w and w[0] == '@' for w in phrase):
        features[('contains_@', None)] = 1
    if any(w and w[0] == '#' for w in phrase):
        features[('contains_#', None)] = 1

    # Features: Misc position data
    features['first_unigram'] = sentence[begin]
    features['last_unigram'] = sentence[end]
    features['phrase_length'] = len(phrase) / 2.0
    features['is_first'] = (begin == 0)
    features['is_last'] = (end == len(sentence) - 1)

    # Feature: Whether every word is a stop word
    if all(tok in utilities.stop_words for tok in phrase):
        features[('all_stopwords', None)] = 1

    # Feature: All Caps? (boolean)
    # No lowercase letter anywhere and at least two adjacent capitals.
    # The trailing class previously lacked its '*', which required exactly
    # one character after the capital pair and so rejected e.g. "OK" and
    # "WOW!!!".
    if re.search(r'^[^a-z]*[A-Z][A-Z][^a-z]*$', text):
        features[('all_caps', None)] = 1

    # Feature: All Punctuation?
    if re.search(r'^[^a-zA-Z0-9]+$', text):
        features[('all_punct', None)] = 1

    # Feature: Emoticon Counts
    elabels = defaultdict(int)
    for word in norm_context:
        elabel = emoticons.emoticon_type(word)
        if elabel:
            elabels[elabel] += 1
    for k, v in elabels.items():
        features[k + '-emoticon'] = v

    # Feature: Punctuation counts (always emitted, even when zero)
    context_text = ''.join(context)
    for c in ('!', '?', '.'):
        features[c + '-count'] = context_text.count(c)

    # Features: character streaks (length of the longest run in the phrase)
    streak_patterns = (
        ('!-streak', r'!+'),
        ('?-streak', r'\?+'),
        ('?!-streak', r'[!\?]+'),
    )
    for featname, pattern in streak_patterns:
        matches = re.findall(pattern, text)
        if matches:
            features[featname] = max(len(m) for m in matches)

    # Feature: Contains elongated long word? (boolean)
    if any(utilities.is_elongated_word(word) for word in phrase):
        features[('contains_elongated_word', None)] = 1

    # Feature: Contains long word? (boolean)
    long_word_threshold = 10
    contains_long_word = False
    for words in normalized[begin:end + 1]:
        for word in words:
            word, _ = split_neg(word)
            word = spell.remove_duplicates(word)
            # Hashtags do not count as long words.
            if len(word) and word[0] == '#':
                continue
            word = word.strip(string.punctuation)
            if len(word) > long_word_threshold:
                contains_long_word = True
    if contains_long_word:
        features[('contains_long_word', None)] = 1

    return features