Example No. 1
def featurize_tweet(t):
    # Lowercase and tokenize the tweet, then normalize each token.
    toks = tokenize(t.lower())
    norm_toks = [norm_token(tok) for tok in toks]
    # Drop tokens that normalize to an empty value.
    norm_toks = [tok for tok in norm_toks if tok]

    # Each distinct normalized token becomes a binary feature.
    feats = list(set(norm_toks))

    return feats
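
featurize_tweet depends on two helpers from the surrounding project, tokenize and norm_token, which are not shown on this page. As a rough sketch only, assuming norm_token's job is to collapse noisy Twitter tokens into canonical forms, a stand-in might look like the following (the placeholder strings and regex are assumptions, not the project's actual code):

import re

# Hypothetical stand-in for the project's norm_token helper; a sketch under
# assumed behavior, not the implementation featurize_tweet actually uses.
def norm_token(tok):
    if tok.startswith('@'):               # collapse user mentions
        return '__USER__'
    if re.match(r'https?://\S+', tok):    # collapse URLs
        return '__URL__'
    if tok.isdigit():                     # collapse bare numbers
        return '__NUM__'
    # Strip surrounding punctuation; an all-punctuation token becomes '' and
    # is dropped by the caller's truthiness filter.
    return tok.strip('.,!?;:"\'')

With such a stand-in, featurize_tweet('@bob check https://x.co now!!') would return something like ['__USER__', 'check', '__URL__', 'now'], in arbitrary set order.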
Example No. 2
def normalize_tweet(tweet):
    '''
    Break the tweet into normalized tokens, appending _NEG or _NEGFIRST tags
    to tokens that appear in a negated context, since we are using a
    contextual lexicon.  Returns a list of normalized tokens.
    '''

    # Lowercase, tokenize, normalize each token, and drop empty results.
    tokens = [norm_token(t) for t in tokenize(tweet.lower())]
    return [t for t in tokens if t]
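
The docstring above mentions _NEG and _NEGFIRST tags, but the tagging itself is not visible in this snippet (it presumably happens inside norm_token or in a later pass). Purely as a hypothetical illustration of that convention, a separate tagging pass over a token list could look like this (the cue and scope-ender sets are assumptions):

# Hypothetical sketch of _NEG / _NEGFIRST tagging; not the project's code.
NEGATION_CUES = {"not", "no", "never", "n't", "cannot"}
SCOPE_ENDERS = {".", ",", "!", "?", ";", ":"}

def tag_negation(tokens):
    tagged, negated, first = [], False, False
    for tok in tokens:
        if tok in NEGATION_CUES:
            # Open a negated context starting at the next token.
            negated, first = True, True
            tagged.append(tok)
        elif tok in SCOPE_ENDERS:
            # Clause-ending punctuation closes the negated context.
            negated = False
            tagged.append(tok)
        elif negated:
            # First token after the cue gets _NEGFIRST, the rest get _NEG.
            tagged.append(tok + ("_NEGFIRST" if first else "_NEG"))
            first = False
        else:
            tagged.append(tok)
    return tagged

For example, tag_negation(['i', 'do', 'not', 'like', 'this', '.', 'really']) yields ['i', 'do', 'not', 'like_NEGFIRST', 'this_NEG', '.', 'really'].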
Example No. 3
    def apply(self, input):
        # Lowercase and tokenize the raw tweet text.
        tokens = tokenize(input.lower())

        # Optionally drop stopwords.
        if self.stop_tokens:
            tokens = [t for t in tokens if t not in self.stop_token_list]

        # Optionally drop @username mentions.
        if self.usernames:
            tokens = [t for t in tokens if not t.startswith('@')]

        # Optionally drop URLs (relies on a module-level `import re`).
        if self.urls:
            tokens = [t for t in tokens if not re.match(self.url_regex, t)]

        # Optionally drop tokens that are nothing but punctuation:
        # translating away punctuation leaves an empty string.
        if self.punctuation:
            tokens = [t for t in tokens if t.translate(self.punctuation_map)]

        return tokens
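
apply() is only a method body; the flags and lookups it reads (stop_tokens, stop_token_list, usernames, urls, url_regex, punctuation, punctuation_map) are defined elsewhere in its class. A minimal sketch of a constructor that would make the method runnable under Python 3, with the class name, defaults, stopword list, and URL regex all assumed:

import string

class TweetFilter(object):
    # Hypothetical constructor; the attribute names are taken from apply()
    # above, but the class name, defaults, and values are assumptions.
    def __init__(self, stop_tokens=True, usernames=True, urls=True, punctuation=True):
        self.stop_tokens = stop_tokens
        self.stop_token_list = {'the', 'a', 'an', 'and', 'or', 'to', 'of'}
        self.usernames = usernames
        self.urls = urls
        self.url_regex = r'https?://\S+'
        self.punctuation = punctuation
        # str.translate table that deletes ASCII punctuation; a token made up
        # entirely of punctuation translates to '' and is filtered out.
        self.punctuation_map = {ord(c): None for c in string.punctuation}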
Example No. 4
    def classify(self, tweet, results=None):
        """
        Takes in a string representing the entire text of a tweet, and uses the internal models
        to calculate the probability of that tweet belonging every 'seen' hashtag.

        If results is none, ALL hashtags and their associated probabilities will be returned.
        Otherwise, if results in an int, then that number of hashtags with the highest
        probabilities with be returned.
        """
        # Twokenize tweet
        tweet_tokens = tokenize(tweet)

        # Filter out any existing hashtags and lowercase the remaining tokens
        tokens = set()
        for token in tweet_tokens:
            if token[0] != '#':
                tokens.add(token.lower())

        # Score every seen hashtag against the tweet's token set
        probs = {}

        for hashtag in self._hashtags():
            probs[hashtag] = self._prob(tokens, hashtag)

        # Slicing with results=None returns the full sorted list
        return sorted(probs.items(), key=lambda t: t[1], reverse=True)[:results]
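
A hypothetical call site, assuming the enclosing classifier (its name, constructor, and training interface are not shown on this page) has already been fit on hashtagged tweets:

# Hypothetical usage; HashtagClassifier is an assumed name for the class
# that defines classify() above.
clf = HashtagClassifier()
top5 = clf.classify("just landed, what a view", results=5)
for hashtag, prob in top5:
    print(hashtag, prob)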