Example #1
def twit_tokenize(raw_twit, ticker=None, normalize=True):
    """Tokenize a raw tweet, dropping cashtags, punctuation, stopwords,
    and (optionally) tokens matching the given ticker symbol."""
    twit = raw_twit

    if normalize:
        # Undo HTML escaping etc. before tokenizing (twokenize helper).
        twit = normalizeTextForTagger(twit)

    tokenized_init = tokenize(twit)
    tokenized_result = []

    for tok in tokenized_init:
        # Drop cashtags such as $AAPL.
        if token_is_cash_tag(tok):
            continue

        # Drop pure punctuation tokens.
        if token_is_punct(tok):
            continue

        # Drop tokens that name the ticker being analyzed.
        if (ticker is not None) and token_matches_ticker(tok, ticker):
            continue

        # Drop stopwords.
        if token_is_stopword(tok):
            continue

        processed_tok = process_token(tok, lowercase=True, stem=False)
        tokenized_result.append(processed_tok)

    return tokenized_result
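
A hypothetical usage sketch; the input string and ticker are made up, and the exact output depends on the helper functions above:

tokens = twit_tokenize("$TSLA to the moon!!! #stocks", ticker="TSLA")
# The cashtag, stopwords, punctuation, and ticker tokens are filtered out,
# leaving something like ['moon', '#stocks'].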
Example #2
def get_hashtags(tweet_json):
    if 'entities' in tweet_json:
        # Newer payloads carry parsed hashtags (already without the '#').
        return [entity['text'].lower() for entity in tweet_json['entities']['hashtags']]
    else:
        # If it's an old tweet, do it the hard way: tokenize the text and
        # strip the leading '#' and lowercase, so both branches return
        # bare, lowercased tags.
        text = get_text_from_tweet_json(tweet_json)
        return list(set(
            t.lstrip("#").lower() for t in tokenize(text)
            if t.startswith("#") and t != "#"))
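
A hypothetical usage sketch with a minimal, made-up tweet payload:

tweet = {'text': 'Loving the new #NLP release',
         'entities': {'hashtags': [{'text': 'NLP'}]}}
get_hashtags(tweet)  # -> ['nlp']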
Example #3
def get_mentions(tweet_json, return_id=False):
    # Return numeric user ids instead of screen names when requested.
    to_return = 'id' if return_id else 'screen_name'
    if 'entities' in tweet_json:
        return [entity[to_return] for entity in tweet_json['entities']['user_mentions']
                if to_return in entity]
    else:
        # If it's an old tweet, do it the hard way: tokenize the text and
        # strip the leading '@'. Note that only screen names are
        # recoverable on this path, even when return_id is True.
        text = get_text_from_tweet_json(tweet_json)
        return list(set(
            t.lstrip("@") for t in tokenize(text)
            if t.startswith("@") and t != "@"))
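
A hypothetical usage sketch; the screen name and id are made up:

tweet = {'text': 'cc @jack',
         'entities': {'user_mentions': [{'screen_name': 'jack', 'id': 12}]}}
get_mentions(tweet)                  # -> ['jack']
get_mentions(tweet, return_id=True)  # -> [12]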
Example #4
# Used for an experiment; not actually used in the pipeline
import sys
import json

from nlp import twokenize

for line in sys.stdin:
    # The tweet JSON sits in the last tab-separated field of each line.
    tweet = json.loads(line.split('\t')[-1])
    print(' '.join(twokenize.tokenize(tweet['text'])))
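
A minimal self-contained check of the same logic, assuming nlp.twokenize is importable; the sample line is made up to show the expected tab-separated input format:

import json
from nlp import twokenize

sample = 'id123\t{"text": "RT @user: check out #nlp :)"}'
tweet = json.loads(sample.split('\t')[-1])
print(' '.join(twokenize.tokenize(tweet['text'])))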