"""Topic-modeling pipeline: load and clean the corpus, fit the model, and
prepare the most probable documents/words for the frontend."""
import collections

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

import explore as e
import production as p

df = p.load_data()
df = p.clean_formatting(df)
df = p.remove_stopwords(df)

# Sanity checks on the cleaned corpus.
# NOTE(review): these were asserts in the original; they vanish under
# `python -O`, so they are kept as asserts only to preserve behavior.
assert type(df) is pd.core.frame.DataFrame, "%r is not a DataFrame." % df
assert df.shape == (16526, 7), "Has the wrong shape."

vectorizer, features = p.extract_features(df, title=True)

## run model.
m = p.run_model(features, n_topics=45, random_state=0, n_iter=100)


## extract and prepare most probable documents.
def save_data_for_frontend(model, vectorizer, df):
    """Collect per-topic top documents and top words for the frontend.

    Parameters: a fitted topic model exposing ``doc_topic_``, the
    vectorizer used to build the features, and the cleaned DataFrame.
    """
    # Transpose so each row corresponds to one topic.
    # NOTE(review): the slice [-5:-1, :] takes the 4th- through 2nd-highest
    # entries and EXCLUDES the single most probable document — confirm
    # whether [-5:, :] was intended.
    doc_ids = np.argsort(model.doc_topic_, axis=0)[-5:-1, :].T
    doc_probs = np.sort(model.doc_topic_, axis=0)[-5:-1, :].T
    topic_total_probs = np.sum(doc_probs, axis=1)

    ## extract and prepare most probable words.
    ## split bigrams and take the unique set of the resulting word list.
    w = p.most_probable_words(model, vectorizer.get_feature_names(), 10)
    word_data = collections.defaultdict(list)
    # NOTE(review): the function appears truncated here — word_data is built
    # but never populated or returned; verify against the full file.
def run_on_sample(features, ix, **kwargs):
    """Fit a topic model on the feature rows selected by ``ix``.

    Extra keyword arguments are forwarded to ``p.run_model``; the random
    seed is pinned to 0 so repeated runs on the same sample agree.
    """
    sample = features[ix, :]
    return p.run_model(sample, random_state=0, **kwargs)