import nltk
from nltk.corpus import stopwords

import utils


def update_terms_stats(terms_fd, json_tweet, lex):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None:
        return False
    # Tokenize on word characters, keeping a leading '#' so hashtags survive.
    tokenizer = nltk.RegexpTokenizer(r'#?[\w\d]+')
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        w = w_raw.strip('\"\'.,;?!:)(@/*&')
        if not w.strip('#').isalpha():
            # Keep only the leading ASCII run; stop at the first non-ASCII character.
            w_aux = ''
            for s in w:
                if ord(s) < 128:
                    w_aux += s
                else:
                    break
            w = w_aux
        w = w.lower()
        # Skip English stopwords, Twitter noise tokens, and very short/long terms.
        if (w not in stopwords.words('english')
                and w not in set(['rt', 'http', 'amp'])
                and len(w) in range(3, 16)):
            if w in lex:
                continue
            tweet_terms.append(w)
            terms_fd.inc(w)
    # Count bigrams of the kept terms, folding both orderings into a single key.
    bigrams = nltk.bigrams(tweet_terms)
    for b in bigrams:
        if b[1] + " " + b[0] in lex or b[0] + " " + b[1] in lex:
            continue
        if b[1] + " " + b[0] in terms_fd:
            terms_fd.inc(b[1] + " " + b[0])
        else:
            terms_fd.inc(b[0] + " " + b[1])
    return True
def update_hashtags_stats(hashtags_fd, json_tweet):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None or '#' not in tweet:
        return False
    tokenizer = nltk.RegexpTokenizer(r'#?[\w\d]+')
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        # Only hashtag tokens contribute to the hashtag frequency distribution.
        if '#' not in w_raw:
            continue
        w = w_raw.strip('\"\'.,;?!:)(@/*&').lower()
        tweet_terms.append(w)
        hashtags_fd.inc(w)
    return True
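# Minimal driver sketch (not part of the original module): it assumes that
# utils.extract_tweet_from_json returns the tweet text for a JSON-encoded tweet
# and that the *_fd arguments expose an NLTK-2-style inc() method, as
# nltk.FreqDist did when this code was written. A small Counter subclass is
# defined here so the sketch stays self-contained; the lexicon contents and the
# build_stats helper name are illustrative only.
from collections import Counter


class IncCounter(Counter):
    """Counter exposing the inc() interface the updaters above expect."""

    def inc(self, key, count=1):
        self[key] += count


def build_stats(json_tweets, lex):
    """Accumulate term/bigram and hashtag frequencies over a stream of tweets."""
    terms_fd = IncCounter()
    hashtags_fd = IncCounter()
    for json_tweet in json_tweets:
        update_terms_stats(terms_fd, json_tweet, lex)
        update_hashtags_stats(hashtags_fd, json_tweet)
    return terms_fd, hashtags_fd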