def train_unsupervised_model(k=10, alpha=.1, max_iters=25, convergence_threshold=.001, baseline=False, silent=False):
    '''
    Fit an EM-style topic model over snowball tweet data.

    phi is the documents-to-topics matrix (n docs x k topics); pi holds the
    per-topic probability estimates derived from phi.

    NOTE(review): this definition is immediately followed by a byte-identical
    redefinition of the same name later in the file, which shadows this one at
    import time -- this copy is dead code.

    NOTE(review): `converged` is initialized False and never reassigned, so the
    loop always runs the full `max_iters` iterations and `convergence_threshold`
    is unused.

    Returns (raw_tweets, tokenized_tweets, phi, pi, topics_to_models).
    '''
    global base_model
    base_model = None
    D = snowball.read_data()
    raw_tweets = D['tweet_text']
    ## 12/1 -- filter?
    # drop tweets tagged "#crc"
    tags = [snowball.which_tags(t) for t in raw_tweets]
    raw_tweets = [t for i, t in enumerate(raw_tweets) if not "#crc" in tags[i]]
    tokenized_tweets = [word_tokenize(tw) for tw in raw_tweets]

    def _seems_to_be_about_soccer(tweet):
        # tokens/hashtags associated with 2014 World Cup chatter; used to
        # filter soccer tweets out of the corpus
        terms = [
            "worldcup", "ger", "usavcrc", "fra", "italia", "mexvcrc",
            "#mexvcrc", "nedvscrc", "#nedvscrc", "nedcrc", "#nedcrc",
            "itavscrc", "#itavscrc", "uruvscrc", "#uruvscrc",
            "worldcup2014", "#worldcup2014", "uruguay"
        ]
        return any([t.lower() in terms for t in tweet])

    indices_to_keep = [
        idx for idx in range(len(tokenized_tweets))
        if not _seems_to_be_about_soccer(tokenized_tweets[idx])
    ]
    raw_tweets = [raw_tweets[idx] for idx in indices_to_keep]
    tokenized_tweets = [tokenized_tweets[idx] for idx in indices_to_keep]

    n = len(tokenized_tweets)
    alphas = [alpha] * k
    phi = np.zeros((n, k))
    for i in range(n):
        # initialize doc rows with random draws from a symmetric Dirichlet
        phi[i, :] = np.random.dirichlet(alphas)

    # initial topic probability estimates
    pi = estimate_pi(phi)

    if not silent:
        print("initial assignments (random)...")
        print_top_tweets_for_topics(phi, raw_tweets, pi)

    iter_ = 0
    converged = False
    while not converged and iter_ < max_iters:
        #######
        # 1. update language models
        #######
        topics_to_models = retrain_language_models(tokenized_tweets, phi, baseline=baseline)

        #######
        # 2. re-estimate \phi
        #######
        phi = estimate_phi(tokenized_tweets, topics_to_models, pi)
        pi = estimate_pi(phi)

        #######
        # assess convergence
        #######
        # NOTE(review): LL is only computed for display when not silent; it is
        # never compared against convergence_threshold.
        if not silent:
            print_top_tweets_for_topics(phi, raw_tweets, pi, n=20)
            cur_LL = LL(topics_to_models, pi, phi, tokenized_tweets)
            print("finished iter: %s; LL: %s" % (iter_, cur_LL))
            #print("finished iter: %s" % iter_)
            print("\n")

        iter_ += 1

    # idx0 = (-1* phi[:,0]).argsort()[:50]
    return raw_tweets, tokenized_tweets, phi, pi, topics_to_models
def train_unsupervised_model(k=10, alpha=.1, max_iters=25, convergence_threshold=.001, baseline=False, silent=False):
    '''
    Fit an EM-style topic model over snowball tweet data.

    phi is the documents-to-topics matrix (n docs x k topics); pi holds the
    per-topic probability estimates derived from phi.

    Parameters:
        k -- number of topics
        alpha -- symmetric Dirichlet concentration for initializing phi rows
        max_iters -- hard cap on EM iterations (assumed >= 1)
        convergence_threshold -- stop when |LL_t - LL_{t-1}| falls below this
        baseline -- passed through to retrain_language_models
        silent -- suppress progress output when True

    Returns (raw_tweets, tokenized_tweets, phi, pi, topics_to_models).

    Fixes vs. the earlier version: `converged` was never assigned, so
    `convergence_threshold` was dead and the loop always ran max_iters; the
    log-likelihood is now computed every iteration (not just when verbose)
    and used to detect convergence.
    '''
    global base_model
    base_model = None

    D = snowball.read_data()
    raw_tweets = D['tweet_text']

    ## 12/1 -- filter? drop tweets tagged "#crc"
    tags = [snowball.which_tags(t) for t in raw_tweets]
    raw_tweets = [t for i, t in enumerate(raw_tweets) if "#crc" not in tags[i]]
    tokenized_tweets = [word_tokenize(tw) for tw in raw_tweets]

    def _seems_to_be_about_soccer(tweet):
        # tokens/hashtags associated with 2014 World Cup chatter; used to
        # filter soccer tweets out of the corpus
        terms = [
            "worldcup", "ger", "usavcrc", "fra", "italia", "mexvcrc",
            "#mexvcrc", "nedvscrc", "#nedvscrc", "nedcrc", "#nedcrc",
            "itavscrc", "#itavscrc", "uruvscrc", "#uruvscrc",
            "worldcup2014", "#worldcup2014", "uruguay"
        ]
        return any(t.lower() in terms for t in tweet)

    indices_to_keep = [
        idx for idx in range(len(tokenized_tweets))
        if not _seems_to_be_about_soccer(tokenized_tweets[idx])
    ]
    raw_tweets = [raw_tweets[idx] for idx in indices_to_keep]
    tokenized_tweets = [tokenized_tweets[idx] for idx in indices_to_keep]

    n = len(tokenized_tweets)
    alphas = [alpha] * k
    phi = np.zeros((n, k))
    for i in range(n):
        # initialize doc rows with random draws from a symmetric Dirichlet
        phi[i, :] = np.random.dirichlet(alphas)

    # initial topic probability estimates
    pi = estimate_pi(phi)

    if not silent:
        print("initial assignments (random)...")
        print_top_tweets_for_topics(phi, raw_tweets, pi)

    iter_ = 0
    converged = False
    prev_LL = None  # log-likelihood from the previous iteration
    while not converged and iter_ < max_iters:
        #######
        # 1. update language models
        #######
        topics_to_models = retrain_language_models(tokenized_tweets, phi, baseline=baseline)

        #######
        # 2. re-estimate \phi
        #######
        phi = estimate_phi(tokenized_tweets, topics_to_models, pi)
        pi = estimate_pi(phi)

        #######
        # assess convergence: stop once the LL improvement drops below threshold
        #######
        cur_LL = LL(topics_to_models, pi, phi, tokenized_tweets)
        if prev_LL is not None and abs(cur_LL - prev_LL) < convergence_threshold:
            converged = True
        prev_LL = cur_LL

        if not silent:
            print_top_tweets_for_topics(phi, raw_tweets, pi, n=20)
            print("finished iter: %s; LL: %s" % (iter_, cur_LL))
            print("\n")

        iter_ += 1

    # idx0 = (-1* phi[:,0]).argsort()[:50]
    return raw_tweets, tokenized_tweets, phi, pi, topics_to_models
def _get_tags(tweets):
    '''
    Look up the tags for each tweet and collect the distinct tag vocabulary.

    Returns a pair: (per-tweet tag lists, list of unique tags across all tweets).
    '''
    per_tweet_tags = [snowball.which_tags(tweet) for tweet in tweets]
    all_unique_tags = list(set(itertools.chain.from_iterable(per_tweet_tags)))
    return per_tweet_tags, all_unique_tags