예제 #1
0
def extract_keywords(sentence):
    '''Extract weighted keywords, hashtags and names from a tweet.

    Keyword candidates come from two sources, both fed with the
    stripped tweet: explicit_keywords() over the stemmed tokens, and
    the filtered results of extract_keywords_grammar() passed through
    the stemmer. Each candidate is expected to be a (keyword, weight)
    pair; the keyword is lower-cased and the weight is passed through
    unchanged. Candidates that match a detected name are dropped so
    that names are not counted twice.

    Returns a list of (keyword, weight) tuples: the remaining keyword
    candidates first, then every hashtag with weight 5.0, then every
    name with weight 1.6 (hashtags and names are lower-cased too).

    TODO: perhaps try to filter hashtags from the explicit_keywords
          and extract_keywords_grammar stuff in the same manner as we
          don't count names doubly.
    '''
    hashtag_weight = 5.0
    name_weight = 1.6

    stripped = strip_tweet(sentence)

    names = set(get_names(sentence))
    hashtags = get_hashtags(sentence)

    # Candidates are lower-cased before the name check, so the lookup set
    # has to be lower-cased as well; comparing against the raw names let
    # any capitalised name slip through and be emitted twice.
    lowered_names = set(name.lower() for name in names)

    stemmed_tokens = [non_aggresive_stemmer(token)
                      for token in nltk.word_tokenize(stripped)]
    candidates = list(explicit_keywords(stemmed_tokens))

    grammar_hits = filter_keywords(extract_keywords_grammar(stripped),
                                   key=lambda hit: hit[0])
    candidates.extend(non_aggresive_stemmer(hit) for hit in grammar_hits)

    keywords = []
    for word, weight in candidates:
        lowered = word.lower()
        if lowered not in lowered_names:
            keywords.append((lowered, weight))

    keywords.extend((tag.lower(), hashtag_weight) for tag in hashtags)
    # Iterate the original name set (not the lower-cased one) so the number
    # of name entries emitted is the same as before.
    keywords.extend((name.lower(), name_weight) for name in names)
    return keywords
예제 #2
0
def extract_features(text):
    '''Return the distinct adjective and verb phrases found in a tweet.

    The tweet is stripped with strip_tweet(), tokenized and POS-tagged,
    then chunked with two rules:

      * Adjective - optional comparative adverbs followed by one or
        more adjectives.
      * RbVerb    - one or more verbs, each optionally preceded by
        adverbs.

    Every chunk becomes one feature: its words joined by single spaces.
    In multi-word RbVerb chunks the tokenizer's contraction pieces
    "n't" and "'m" are rewritten to "not" and "am".

    Returns a list of unique feature strings in no particular order.
    '''
    # Strip before tagging. The original tagged the raw tweet and only
    # afterwards computed the stripped text, which was then never used.
    text = strip_tweet(text)
    sequence = nltk.pos_tag(nltk.word_tokenize(text))
    grammar='''Adjective: {<RBR>*(<JJ>|<JJS>|<JJT>|<JJR>)+}
               RbVerb: {(<RB>*(<VBN>|<VB>|<VBP>|<VBG>))+}'''
    chunker = nltk.RegexpParser(grammar)

    features = set()
    for subtree in chunker.parse(sequence).subtrees():
        # NLTK 3 exposes the chunk name through label(); older releases
        # only have the .node attribute.
        label = subtree.label() if hasattr(subtree, "label") else subtree.node
        if label not in ("Adjective", "RbVerb"):
            continue

        # Chunk children are (word, tag) pairs; only the words are kept.
        phrase = " ".join([word for word, _tag in subtree])

        # Single-token verb chunks are deliberately left untouched, which
        # matches the previous behaviour.
        if label == "RbVerb" and len(subtree) > 1:
            phrase = phrase.replace("n't", "not")
            phrase = phrase.replace("'m", "am")

        features.add(phrase)

    return list(features)