def TokenizeTweets(data_path):
    """Tokenize each learner's English tweets and dump per-learner word counts to JSON."""
    learner_tweet_map = {}
    tok = Tokenizer(preserve_case=False)
    learners = os.listdir(data_path)
    for learner in learners:
        tweet_path = os.path.join(data_path, learner, "tweet")
        if os.path.isfile(tweet_path):
            tweet_file = open(tweet_path, "r")
            lines = tweet_file.readlines()
            tweet_file.close()
            individual_word_count_map = {}
            num_english_tweet = 0
            for line in lines:
                try:
                    jsonObject = json.loads(line)
                    if jsonObject["lang"] == "en":
                        tweet = jsonObject["text"]
                        tokenized_tweet = tok.tokenize(tweet)
                        num_english_tweet += 1
                        for word in tokenized_tweet:
                            individual_word_count_map[word] = individual_word_count_map.get(word, 0) + 1
                except Exception:
                    print line
            learner_tweet_map[learner] = {}
            learner_tweet_map[learner]["tweet"] = individual_word_count_map
            learner_tweet_map[learner]["num_english_tweet"] = num_english_tweet
            if len(learner_tweet_map) % 100 == 0:
                print len(learner_tweet_map)
    output_path = os.path.join(os.path.dirname(os.path.dirname(data_path)), "all_tokenized_tweets")
    output_file = open(output_path, "w")
    output_file.write(json.dumps(learner_tweet_map))
    output_file.close()
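# Minimal usage sketch for TokenizeTweets (illustration only): "learner_data/" is a
# hypothetical directory whose per-learner subfolders each contain a "tweet" file of
# one JSON object per line (Twitter API format with "lang" and "text" fields).
TokenizeTweets("learner_data/")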
def tokenize_tweet(tweet, tok=None):
    """Tokenize the tweet and discard any with 3 or fewer tokens."""
    if not tok:
        tok = Tokenizer()
    tweet = tweet.strip()
    tokens = tok.tokenize(tweet)
    if len(tokens) > 3:
        tweet = " ".join(tokens)
    else:
        tweet = ""
    return tweet
def process_tweets(tweets, target):
    tok = Tokenizer()
    processed_tweets = []
    for tweet in tweets:
        tweet = clean_tweet(tweet, target)
        tweet = remove_self_refs(tweet, target)
        tweet = tokenize_tweet(tweet, tok)
        if tweet:
            processed_tweets.append(tweet)
    print "Remaining Tweets:", len(processed_tweets)
    return processed_tweets
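# Usage sketch for the two helpers above. clean_tweet and remove_self_refs are assumed
# to be defined elsewhere in this module, and the sample tweets and target value below
# are made up for illustration.
sample_tweets = [
    "@user I really liked the movie last night !",
    "ok",  # dropped by tokenize_tweet: 3 or fewer tokens
]
kept = process_tweets(sample_tweets, target="movie")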
class FeatureWorker(TextWorker):
    def __init__(self):
        super(FeatureWorker, self).__init__()
        self.tok = Tokenizer()

    def extractNgramPerTweet(self, tweet, n=1):
        """Extract n-grams from a tweet after standardizing it."""
        tweet = self.shrinkSpace(tweet)
        tweet = self.remove_handles(tweet)
        tweet = self.remove_urls(tweet)
        tokens = self.tok.tokenize(tweet)
        ngrams = Counter([" ".join(x) for x in zip(*[tokens[i:] for i in range(n)])])
        return ngrams

    def fullNGramExtract(self, tweet_list, n=1):
        """Extract relative frequencies of all 1..n-grams over a list of tweets."""
        all_ngrams = Counter()
        for i in range(n):
            this_ngrams = Counter()
            for tweet in tweet_list:
                this_ngrams.update(self.extractNgramPerTweet(tweet, i + 1))
            total_ngrams = float(sum(this_ngrams.values()))
            if total_ngrams > 0:
                all_ngrams.update({gram: value / total_ngrams
                                   for gram, value in this_ngrams.items()})
        return all_ngrams

    def extractLexicon(self, ngrams, lex, intercepts=None):
        """Score each lexicon category as the weighted sum of the user's n-gram frequencies."""
        pLex = {}  # prob of lex given user
        for term, cats in lex.iteritems():
            try:
                gn = ngrams[term]
                for cat, weight in cats.iteritems():
                    try:
                        pLex[cat] += float(gn) * weight
                    except KeyError:
                        pLex[cat] = float(gn) * weight
            except KeyError:
                pass  # term not in the user's n-grams
        if intercepts:
            for cat in intercepts:
                pLex[cat] = pLex.get(cat, 0.0) + intercepts[cat]
        return pLex
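# Sketch of how FeatureWorker might be driven end to end. FeatureWorker extends
# TextWorker (defined elsewhere, providing shrinkSpace/remove_handles/remove_urls),
# so this assumes that base class is importable. The lexicon literal is a toy
# stand-in for whatever weighted lexicon (term -> {category: weight}) is used.
fw = FeatureWorker()
tweets = ["I love this so much", "so happy today , love it"]
ngrams = fw.fullNGramExtract(tweets, n=2)    # relative 1- and 2-gram frequencies
toy_lex = {"love": {"POS": 1.0}, "happy": {"POS": 0.8}}
scores = fw.extractLexicon(ngrams, toy_lex)  # e.g. {"POS": weighted usage}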
def extract_features(tweet, uid):
    if not config.tokenizer:
        config.tokenizer = Tokenizer()
    tokens = config.tokenizer.tokenize(tweet)
    feats = []
    # Character ngram features
    for n in range(2, 6):
        feats += get_char_ngram_feats(tweet, n)
    # Word ngram features
    for n in range(1, 4):
        feats += get_word_ngram_feats(tokens, n)
    feats += apply_user_factor_adaptation(feats, uid)
    return feats
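# Hypothetical call to extract_features. config is assumed to be the module-level
# settings object referenced above (with a tokenizer attribute), and the ngram and
# user-adaptation helpers are assumed to be in scope; the tweet and uid are invented.
feats = extract_features("@friend this new phone is great !", uid="12345")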
def __init__(self):
    super(FeatureWorker, self).__init__()
    self.tok = Tokenizer()
# def run_proc2(file, q):
#     soup = BeautifulSoup(open(file), 'xml')
#     posts = soup.find_all('post')
#     for post in posts:
#         tokenized = tok.tokenize(post)
#         for token in tokenized:
#             if token in user_word_count[user_id]:
#                 dict[user_id][token] += 1
#             else:
#                 dict[user_id][token] = 1

tok = Tokenizer(preserve_case=False)
files_list = glob.glob(sys.argv[1] + '/*.xml')
user_word_count = {}
posts_count = 0
words_count = 0
industries = {}
ages = {}
genders = {}
user_industry_map = {}
sample_size = 0
# pull the numeric user id and the two-digit age out of the blog file name
user_id_re = re.compile(r'(?<=/)(\d{3,8})(?=.)')
user_age_re = re.compile(r'(?<=\.)(\d{2})(?=\.)')
wordsForTopic = dict()
with open(topic_count_file, 'rb') as csvfile:
    topicreader = csv.reader(csvfile)
    topicreader.next()  # throw out header
    for row in topicreader:
        topic = int(row[0])
        words = row[1:][::2][:4]  # keep the first four words (every other column after the topic id)
        wordsForTopic[topic] = words
print "[DONE]"

################################
# 1. READ AND TOKENIZE THE CORPUS

# READ CORPUS
dirFiles = os.listdir(blogsDir)
print "[LOADING CORPUS (%d files) AND CALCULATING TOPIC USAGE]" % len(dirFiles)
tkzer = Tokenizer()
postsRe = re.compile(r'<post>(.*?)</post>', re.DOTALL | re.I)  # .*? = non-greedy match
userData = dict()  # dictionary of user_id => {age, gender, industry, topics}
filesRead = 0
(numPosts, numWords, industries) = (0, 0, dict())  # for answering question one
for file in dirFiles:
    if fnmatch.fnmatch(file, '*.xml'):
        user_id, gender, age, industry, zodiac = file.split('.')[:5]
        industry = industry.lower()
        wordCounts = dict()
        totalWords = 0
        currentFile = open(blogsDir + '/' + file).read()
        posts = postsRe.findall(currentFile)
        for post in posts:
            words = tkzer.tokenize(post)
def parse_blogs(path):
    tokenizer = Tokenizer()
    users = {}
    global_words_dict = {}
    industry_map = {}
    total_blog_posts = 0
    iterations = 0
    topics = pd.read_csv('wwbpFBtopics_condProb.csv')
    regex = r'<post>(.*?)</post>'
    for filename in os.listdir(path):
        iterations += 1
        print "user %d" % iterations
        if iterations > 50:  # cap on the number of users processed
            break
        if filename.startswith("."):
            continue
        parts = filename.split(".")
        user_attributes_map = {}
        word_count_map = {}
        topic_prob_map = {}
        user_total_words_count = 0
        user_id = int(parts[0])
        gender = 0 if parts[1] == "male" else 1
        age = int(parts[2])
        industry = parts[3]
        star_sign = parts[4]
        if user_id in users:
            user_attributes_map = users[user_id]
        industry_map[industry] = industry_map.get(industry, 0) + 1
        with open(path + filename, 'r') as user_blog:
            user_blogs = user_blog.read().replace('\n', '').replace('\r', '').replace('\t', '')
        all_blog_posts = re.findall(regex, user_blogs, re.DOTALL)
        total_blog_posts += len(all_blog_posts)
        for blog in all_blog_posts:
            words = tokenizer.tokenize(blog.strip())
            user_total_words_count += len(words)
            if 'wc_map' in user_attributes_map:
                word_count_map = user_attributes_map['wc_map']
            for word in words:
                word_count_map[word] = word_count_map.get(word, 0) + 1
                global_words_dict[word] = global_words_dict.get(word, 0) + 1
        # p(topic|user) = sum over the topic's words of p(topic|word) * p(word|user)
        for topic in range(2000):
            prob_topic_given_user = 0.0
            topic_dict = topics[topics['category'] == topic]
            for row in topic_dict.itertuples():
                word = row[1]
                prob_topic_given_word = row[3]
                if word in word_count_map and user_total_words_count > 0:
                    count_user_word = word_count_map[word]
                    # float division; integer division would truncate to 0
                    prob_word_given_user = float(count_user_word) / user_total_words_count
                    prob_topic_given_user += prob_topic_given_word * prob_word_given_user
            topic_prob_map[topic] = prob_topic_given_user
        user_attributes_map['wc_map'] = word_count_map
        user_attributes_map['age'] = age
        user_attributes_map['industry'] = industry
        user_attributes_map['star_sign'] = star_sign
        user_attributes_map['user_id'] = user_id
        user_attributes_map['topic_prob_map'] = topic_prob_map
        user_attributes_map['total_count'] = user_total_words_count
        user_attributes_map['gender'] = gender
        users[user_id] = user_attributes_map
    return (users, global_words_dict, industry_map, total_blog_posts)
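# Example invocation of parse_blogs. "blogs/" is a placeholder for the blog-corpus
# directory of <user_id>.<gender>.<age>.<industry>.<star_sign>.xml files, and
# wwbpFBtopics_condProb.csv must be present in the working directory.
users, vocab, industry_map, n_posts = parse_blogs("blogs/")
print "parsed %d users, %d posts" % (len(users), n_posts)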
def __init__(self, delimiter=" "):
    # delimiter used to split tweets into tokens
    self.DELIM = delimiter
    self.tokenizer = Tokenizer()
class Preprocessor:
    def __init__(self, delimiter=" "):
        # delimiter used to split tweets into tokens
        self.DELIM = delimiter
        self.tokenizer = Tokenizer()

    def tokenize(self, tweet):
        return " ".join(self.tokenizer.tokenize(tweet))

    def replace_user_tags(self, tweet, remove=False):
        """
        Replace mentions of usernames with "@USER";
        if remove=True, remove the user mentions instead

        >>> p=Preprocessor()
        >>> p.replace_user_tags("@maya yes this is cool1@ did b@ @augyyz")
        '@USER yes this is cool1@ did b@ @USER'
        >>> p.replace_user_tags("@maya yes this is cool1@ did b@ @augyyz",remove=True)
        'yes this is cool1@ did b@'
        """
        if remove:
            return self.DELIM.join(
                [w for w in tweet.split(self.DELIM) if not w.startswith("@")])
        else:
            return self.DELIM.join(
                ["@USER" if w.startswith("@") else w
                 for w in tweet.split(self.DELIM)])

    def replace_urls(self, tweet, remove=False):
        """
        Replace URLs with "@URL"; if remove=True, remove them instead

        >>> p=Preprocessor()
        >>> p.replace_urls("@maya yes this is cool1@ did b@ @augyyz http://www.bitly")
        '@maya yes this is cool1@ did b@ @augyyz @URL'
        >>> p.replace_urls("@maya yes this is cool1@ did b@ @augyyz http://www.bitly",remove=True)
        '@maya yes this is cool1@ did b@ @augyyz'
        """
        if remove:
            return self.DELIM.join(
                [w for w in tweet.split(self.DELIM) if not w.startswith("http")])
        else:
            return self.DELIM.join(
                ["@URL" if w.startswith("http") else w
                 for w in tweet.split(self.DELIM)])

    def replace_hashtags(self, tweet, remove=False):
        """
        Replace hashtags (any number of # at token start) with "@HASHTAG";
        if remove=True, remove them instead

        >>> p=Preprocessor()
        >>> p.replace_hashtags("yes #cool we are in #miami ###yes")
        'yes @HASHTAG we are in @HASHTAG @HASHTAG'
        >>> p.replace_hashtags("yes #cool we# are in #miami ###yes",remove=True)
        'yes we# are in'
        >>> p.replace_hashtags("yes #cool we# are in #miami ###yes bar . #wishiwere in italy .")
        'yes @HASHTAG we# are in @HASHTAG @HASHTAG bar . @HASHTAG in italy .'
        """
        if remove:
            return self.DELIM.join(
                [w for w in tweet.split(self.DELIM) if not w.startswith("#")])
        else:
            return self.DELIM.join(
                ["@HASHTAG" if w.startswith("#") else w
                 for w in tweet.split(self.DELIM)])
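# Chaining the Preprocessor on a made-up tweet: tokenize first so that mentions,
# URLs, and hashtags are whitespace-separated tokens before the replace_* methods run.
# The exact output depends on how the underlying Tokenizer splits the URL, but it is
# expected to look roughly like '@USER check @URL @HASHTAG'.
p = Preprocessor()
t = p.tokenize("@maya check http://bit.ly/x #cool")
t = p.replace_user_tags(t)
t = p.replace_urls(t)
t = p.replace_hashtags(t)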