示例#1
0
import collections

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

import explore as e
import production as p

# Load the data and pass it through the project's preparation helpers
# (all defined in the ``production`` module).
df = p.load_data()
df = p.clean_formatting(df)
df = p.remove_stopwords(df)

# Sanity checks on the prepared frame before features are extracted.
# ``isinstance`` also accepts DataFrame subclasses, and the offending object
# is wrapped in a 1-tuple so %-formatting cannot mistake a tuple or mapping
# value for several format arguments.
assert isinstance(df, pd.DataFrame), "%r is not a DataFrame." % (df,)
assert df.shape == (16526, 7), "Has the wrong shape."

vectorizer, features = p.extract_features(df, title=True)

## run model.
m = p.run_model(features, n_topics=45, random_state=0, n_iter=100)



## extract and prepare most probable documents.

def save_data_for_frontend(model, vectorizer, df):
    """Compute document rankings and a word list from ``model``.

    The body only builds intermediate values: nothing is written or
    returned (the function implicitly returns ``None``), and ``df`` is
    not read.

    Args:
        model: Object exposing a 2-D ``doc_topic_`` array; presumably a
            fitted topic model with one row per document and one column
            per topic -- TODO confirm.
        vectorizer: Object exposing ``get_feature_names()``; the result is
            forwarded to ``p.most_probable_words``.
        df: Unused by this body.
    """
    # Rows -5..-2 of the column-wise ascending sort, transposed so that each
    # output row corresponds to one column of ``doc_topic_``.
    # NOTE(review): ``-5:-1`` leaves out the last (largest) row of the sort;
    # confirm that skipping it is intended rather than ``-5:``.
    tail = slice(-5, -1)
    ranked_ids = np.argsort(model.doc_topic_, axis=0)[tail, :].T
    ranked_probs = np.sort(model.doc_topic_, axis=0)[tail, :].T
    summed_probs = ranked_probs.sum(axis=1)

    ## extract and prepare most probable words.
    ## (the bigram-splitting step mentioned by the original comment is not
    ## present in this body.)
    # NOTE(review): if this is a scikit-learn vectorizer, newer releases
    # expose ``get_feature_names_out`` instead -- confirm the pinned version.
    vocabulary = vectorizer.get_feature_names()
    top_words = p.most_probable_words(model, vocabulary, 10)
    words_by_key = collections.defaultdict(list)
def run_on_sample(features, ix, **kwargs):
    """Run ``p.run_model`` on the rows of ``features`` selected by ``ix``.

    Args:
        features: 2-D indexable object; indexed as ``features[ix, :]``.
        ix: Row selector used for that indexing.
        **kwargs: Forwarded unchanged to ``p.run_model``. ``random_state``
            is always passed as 0 here, so supplying it in ``kwargs``
            raises ``TypeError`` (duplicate keyword argument).

    Returns:
        Whatever ``p.run_model`` returns for the selected rows.
    """
    sample = features[ix, :]
    return p.run_model(sample, random_state=0, **kwargs)