import copy

import numpy as np
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

import snowball  # local module that loads the scraped tweet data


def train_supervised_model():
    D = snowball.read_data()
    raw_tweets = D['tweet']
    tokenized_tweets = [word_tokenize(tw) for tw in raw_tweets]

    # as a first pass, we'll treat these (hashtags) as labels.
    tags, tag_set = _get_tags(raw_tweets)

    base_model = Word2Vec()  # workers=multiprocessing.cpu_count(),
                             # iter=3)
    # initialize a shared vocab
    base_model.build_vocab(tokenized_tweets)

    # currently every tweet goes into the training split, so the test split is empty
    n_train = int(1 * len(tags))
    tweets_train, tweets_test = tokenized_tweets[:n_train], tokenized_tweets[n_train:]
    tags_train, tags_test = tags[:n_train], tags[n_train:]

    tags_to_models = {}
    for tag in tag_set:
        tweets_for_tag = _tweets_with_tags(tweets_train, tags_train, [tag])

        # train up a model for this tag
        m = copy.deepcopy(base_model)
        print("training language model for tag %s with %s examples..." %
              (tag, len(tweets_for_tag)))
        # note: recent gensim releases also require total_examples and epochs
        # arguments to train()
        m.train(tweets_for_tag)
        print("ok.")
        tags_to_models[tag] = m

    return tags_to_models
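# _get_tags and _tweets_with_tags are called above but not defined in this
# section. The two helpers below are minimal hypothetical sketches inferred
# only from how they are used: _get_tags is assumed to return a per-tweet
# list of hashtags plus the set of distinct hashtags, and _tweets_with_tags
# is assumed to select the tweets whose hashtags intersect the requested tags.
# The actual implementations may differ.

def _get_tags(raw_tweets):
    # assumed behavior: pull '#'-prefixed tokens out of each raw tweet string
    tags = [[tok.lower() for tok in tweet.split() if tok.startswith("#")]
            for tweet in raw_tweets]
    tag_set = set(t for tweet_tags in tags for t in tweet_tags)
    return tags, tag_set


def _tweets_with_tags(tweets, tags, target_tags):
    # assumed behavior: keep tweets whose tag list contains any target tag
    return [tw for tw, tweet_tags in zip(tweets, tags)
            if any(t in tweet_tags for t in target_tags)]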
def train_unsupervised_model(k=10, alpha=.1, max_iters=25,
                             convergence_threshold=.001,
                             baseline=False, silent=False):
    '''
    phi is the documents-to-topics matrix
    '''
    global base_model
    base_model = None

    D = snowball.read_data()
    raw_tweets = D['tweet_text']

    ## 12/1 -- filter?
    tags = [snowball.which_tags(t) for t in raw_tweets]
    raw_tweets = [t for i, t in enumerate(raw_tweets) if "#crc" not in tags[i]]
    tokenized_tweets = [word_tokenize(tw) for tw in raw_tweets]

    def _seems_to_be_about_soccer(tweet):
        terms = ["worldcup", "ger", "usavcrc", "fra", "italia", "mexvcrc",
                 "#mexvcrc", "nedvscrc", "#nedvscrc", "nedcrc", "#nedcrc",
                 "itavscrc", "#itavscrc", "uruvscrc", "#uruvscrc",
                 "worldcup2014", "#worldcup2014", "uruguay"]
        return any(t.lower() in terms for t in tweet)

    indices_to_keep = [idx for idx in range(len(tokenized_tweets))
                       if not _seems_to_be_about_soccer(tokenized_tweets[idx])]
    raw_tweets = [raw_tweets[idx] for idx in indices_to_keep]
    tokenized_tweets = [tokenized_tweets[idx] for idx in indices_to_keep]

    n = len(tokenized_tweets)
    alphas = [alpha] * k
    phi = np.zeros((n, k))
    for i in range(n):
        # initialize doc rows with draws from a symmetric Dirichlet
        phi[i, :] = np.random.dirichlet(alphas)

    # initial topic probability estimates
    pi = estimate_pi(phi)

    if not silent:
        print("initial assignments (random)...")
        print_top_tweets_for_topics(phi, raw_tweets, pi)

    iter_ = 0
    converged = False
    # note: convergence_threshold is not currently checked, so the loop
    # always runs for max_iters iterations
    while not converged and iter_ < max_iters:
        #######
        # 1. update language models
        #######
        topics_to_models = retrain_language_models(tokenized_tweets, phi,
                                                   baseline=baseline)

        #######
        # 2. re-estimate \phi
        #######
        phi = estimate_phi(tokenized_tweets, topics_to_models, pi)
        pi = estimate_pi(phi)

        #######
        # assess convergence
        #######
        if not silent:
            print_top_tweets_for_topics(phi, raw_tweets, pi, n=20)

        cur_LL = LL(topics_to_models, pi, phi, tokenized_tweets)
        print("finished iter: %s; LL: %s" % (iter_, cur_LL))
        # print("finished iter: %s" % iter_)
        print("\n")
        iter_ += 1

    # idx0 = (-1 * phi[:, 0]).argsort()[:50]
    return raw_tweets, tokenized_tweets, phi, pi, topics_to_models
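# estimate_pi, estimate_phi, and LL are called above but not defined in this
# section. The sketches below are one plausible reading of the E-step and
# likelihood bookkeeping implied by the training loop; they assume each
# per-topic language model exposes a hypothetical score_document(tokens)
# method returning the log-probability of a tokenized tweet. The real
# interface and implementations may differ.

from scipy.special import logsumexp


def estimate_pi(phi):
    # mixing proportions: average topic responsibility across documents
    pi = phi.sum(axis=0)
    return pi / pi.sum()


def estimate_phi(tokenized_tweets, topics_to_models, pi):
    # responsibilities: phi[i, k] proportional to pi[k] * P(tweet_i | model_k),
    # computed in log space and normalized per document
    n, k = len(tokenized_tweets), len(pi)
    log_phi = np.zeros((n, k))
    for i, tweet in enumerate(tokenized_tweets):
        for topic in range(k):
            log_phi[i, topic] = (np.log(pi[topic]) +
                                 topics_to_models[topic].score_document(tweet))
        log_phi[i, :] -= logsumexp(log_phi[i, :])
    return np.exp(log_phi)


def LL(topics_to_models, pi, phi, tokenized_tweets):
    # observed-data log-likelihood: sum_i log sum_k pi[k] * P(tweet_i | model_k)
    # (phi is accepted to match the call above but is not needed here)
    total = 0.0
    for tweet in tokenized_tweets:
        log_terms = [np.log(pi[topic]) + topics_to_models[topic].score_document(tweet)
                     for topic in range(len(pi))]
        total += logsumexp(np.array(log_terms))
    return total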