Example #1
import nltk
from nltk.corpus import stopwords  # requires the stopwords corpus: nltk.download('stopwords')

import utils  # project-local helper providing extract_tweet_from_json


def update_terms_stats(terms_fd, json_tweet, lex):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None:
        return False
    tokenizer = nltk.RegexpTokenizer(r'#?[\w\d]+')  # raw string avoids invalid escape-sequence warnings
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        w = w_raw.strip('\"\'.,;?!:)(@/*&')
        if not (w.strip('#')).isalpha():
            w_aux = ''
            # keep only the leading ASCII characters; truncate at the first non-ASCII one
            for s in w:
                if ord(s) < 128:
                    w_aux += s
                else:
                    break
            w = w_aux
        w = w.lower()
        if (w not in stopwords.words('english')
                and w not in {'rt', 'http', 'amp'}
                and 3 <= len(w) <= 15):
            if w in lex:
                continue
            tweet_terms.append(w)
            terms_fd.inc(w)  # terms_fd is assumed to expose an NLTK 2.x-style FreqDist inc() method
    # count bigrams, collapsing the two word orders onto whichever variant was counted first
    bigrams = nltk.bigrams(tweet_terms)
    for b in bigrams:
        if b[1] + " " + b[0] in lex or b[0] + " " + b[1] in lex:
            continue
        if b[1] + " " + b[0] in terms_fd:
            terms_fd.inc(b[1] + " " + b[0])
        else:
            terms_fd.inc(b[0] + " " + b[1])
    return True
Example #2
import nltk
from nltk.corpus import stopwords  # requires the stopwords corpus: nltk.download("stopwords")

import utils  # project-local helper providing extract_tweet_from_json


def update_terms_stats(terms_fd, json_tweet, lex):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None:
        return False
    tokenizer = nltk.RegexpTokenizer(r"#?[\w\d]+")  # raw string avoids invalid escape-sequence warnings
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        w = w_raw.strip("\"'.,;?!:)(@/*&")
        if not (w.strip("#")).isalpha():
            w_aux = ""
            # keep only the leading ASCII characters; truncate at the first non-ASCII one
            for s in w:
                if ord(s) < 128:
                    w_aux += s
                else:
                    break
            w = w_aux
        w = w.lower()
        if w not in stopwords.words("english") and w not in {"rt", "http", "amp"} and 3 <= len(w) <= 15:
            if w in lex:
                continue
            tweet_terms.append(w)
            terms_fd.inc(w)  # terms_fd is assumed to expose an NLTK 2.x-style FreqDist inc() method
    # count bigrams, collapsing the two word orders onto whichever variant was counted first
    bigrams = nltk.bigrams(tweet_terms)
    for b in bigrams:
        if b[1] + " " + b[0] in lex or b[0] + " " + b[1] in lex:
            continue
        if b[1] + " " + b[0] in terms_fd:
            terms_fd.inc(b[1] + " " + b[0])
        else:
            terms_fd.inc(b[0] + " " + b[1])
    return True
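A minimal driver sketch for the function above, not part of the original code: it assumes the input file holds one JSON-encoded tweet per line, that utils.extract_tweet_from_json accepts the parsed object, and that the counter passed in exposes the NLTK 2.x-style inc() method (removed in NLTK 3, hence the small shim). The IncFreqDist name and the tweets.jsonl path are illustrative only.

import json

import nltk


class IncFreqDist(nltk.FreqDist):
    # NLTK 3.x removed FreqDist.inc(); this shim restores the method the function expects
    def inc(self, sample, count=1):
        self[sample] += count


terms_fd = IncFreqDist()
lex = set()  # lexicon of terms/bigrams to skip; left empty for illustration

with open("tweets.jsonl") as fh:  # one JSON-encoded tweet per line (assumption)
    for line in fh:
        update_terms_stats(terms_fd, json.loads(line), lex)

print(terms_fd.most_common(10))  # ten most frequent unigrams/bigrams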
Example #3
import nltk

import utils  # project-local helper providing extract_tweet_from_json


def update_hashtags_stats(hashtags_fd, json_tweet):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None or '#' not in tweet:
        return False
    tokenizer = nltk.RegexpTokenizer(r'#?[\w\d]+')  # raw string avoids invalid escape-sequence warnings
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        if '#' not in w_raw:
            continue
        w = (w_raw.strip('\"\'.,;?!:)(@/*&')).lower()
        tweet_terms.append(w)  # collected for symmetry with update_terms_stats; not used further here
        hashtags_fd.inc(w)  # hashtags_fd is assumed to expose an NLTK 2.x-style FreqDist inc() method
    return True
Example #4
import nltk

import utils  # project-local helper providing extract_tweet_from_json


def update_hashtags_stats(hashtags_fd, json_tweet):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None or "#" not in tweet:
        return False
    tokenizer = nltk.RegexpTokenizer(r"#?[\w\d]+")  # raw string avoids invalid escape-sequence warnings
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        if "#" not in w_raw:
            continue
        w = (w_raw.strip("\"'.,;?!:)(@/*&")).lower()
        tweet_terms.append(w)  # collected for symmetry with update_terms_stats; not used further here
        hashtags_fd.inc(w)  # hashtags_fd is assumed to expose an NLTK 2.x-style FreqDist inc() method
    return True
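The hashtag counter can be exercised the same way. The sketch below is likewise illustrative rather than part of the original code: it re-declares the inc()-compatible shim so it stands alone and again assumes one JSON-encoded tweet per input line.

import json

import nltk


class IncFreqDist(nltk.FreqDist):
    # shim restoring the NLTK 2.x inc() method that the function calls
    def inc(self, sample, count=1):
        self[sample] += count


hashtags_fd = IncFreqDist()

with open("tweets.jsonl") as fh:  # one JSON-encoded tweet per line (assumption)
    for line in fh:
        update_hashtags_stats(hashtags_fd, json.loads(line))

print(hashtags_fd.most_common(10))  # ten most frequent hashtags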