import operator
from functools import reduce

import nltk


def extract_keywords(sentence):
    '''Extract hashtags and keywords from a tweet.

    Returns a list of (keyword, weight) tuples. The weights are
    hard-coded heuristics for now (5.0 for hashtags, 1.6 for names,
    whatever the keyword extractors assign otherwise) and may well
    change later.

    TODO: filter hashtags out of the explicit_keywords and
    extract_keywords_grammar results the same way names are, so that
    hashtags are not counted twice either.
    '''
    def concat(*lists):
        # Flatten several lists into one.
        return reduce(operator.add, [list(l) for l in lists], [])

    stripped = strip_tweet(sentence)
    # A set speeds up the membership tests used for filtering below.
    names = set(get_names(sentence))
    hashtags = get_hashtags(sentence)

    # Keywords from the explicit list and from the grammar-based
    # extractor, stemmed (keyword only, keeping its score).
    keywords = concat(
        explicit_keywords([non_aggresive_stemmer(w)
                           for w in nltk.word_tokenize(stripped)]),
        [(non_aggresive_stemmer(kw), score)
         for kw, score in filter_keywords(extract_keywords_grammar(stripped),
                                          key=lambda a: a[0])],
    )
    # Lower-case, drop anything already counted as a name, then weight
    # hashtags and names separately.
    return concat(
        [(kw.lower(), score) for kw, score in keywords
         if kw.lower() not in names],
        [(h.lower(), 5.0) for h in hashtags],
        [(n.lower(), 1.6) for n in names],
    )
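# Hedged usage sketch for extract_keywords. It assumes the project helpers
# referenced above (strip_tweet, get_names, get_hashtags, explicit_keywords,
# filter_keywords, extract_keywords_grammar, non_aggresive_stemmer) are
# defined elsewhere in this module and that the NLTK 'punkt' tokenizer data
# is installed. The tweet and the output are illustrative only; actual
# results depend entirely on those helpers:
#
#     >>> extract_keywords("Just watched #Inception, Nolan is a genius")
#     [('genius', 1.0), ('inception', 5.0), ('nolan', 1.6)]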
def extract_features(text):
    '''Extract adjective and adverb+verb chunks from a tweet as features.'''
    # Remove tweet-specific markup before tokenizing and POS-tagging.
    text = strip_tweet(text)
    sequence = nltk.pos_tag(nltk.word_tokenize(text))
    grammar = '''
        Adjective: {<RBR>*(<JJ>|<JJS>|<JJT>|<JJR>)+}
        RbVerb: {(<RB>*(<VBN>|<VB>|<VBP>|<VBG>))+}
    '''
    chunks = nltk.RegexpParser(grammar)
    feat = []
    for t in chunks.parse(sequence).subtrees():
        if t.label() == "Adjective":
            if len(t) > 1:
                # Join multi-word chunks into a single feature string.
                feat.append(" ".join(word for word, _tag in t))
            else:
                feat.append(t[0][0])
        elif t.label() == "RbVerb":
            if len(t) > 1:
                line = " ".join(word for word, _tag in t)
                # Expand contractions so features come out uniform
                # ("is n't good" -> "is not good").
                line = line.replace("n't", "not")
                line = line.replace("'m", "am")
                feat.append(line)
            else:
                feat.append(t[0][0])
    return list(set(feat))
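# Self-contained sketch of the chunking step used above (assumes the NLTK
# 'punkt' and 'averaged_perceptron_tagger' data have been downloaded via
# nltk.download()). It tags a hypothetical sentence and prints the chunks
# matched by the same Adjective/RbVerb grammar:
if __name__ == "__main__":
    demo = "The plot was really not good but the acting is amazing"
    tagged = nltk.pos_tag(nltk.word_tokenize(demo))
    parser = nltk.RegexpParser(
        '''Adjective: {<RBR>*(<JJ>|<JJS>|<JJT>|<JJR>)+}
           RbVerb: {(<RB>*(<VBN>|<VB>|<VBP>|<VBG>))+}''')
    for subtree in parser.parse(tagged).subtrees():
        if subtree.label() in ("Adjective", "RbVerb"):
            print(subtree.label(), [word for word, _tag in subtree])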