def find_reconstruction_errors(tfidf, min_features=1, max_features=20, verbose=False):
    """
    DESCR: fit one NMF model per candidate number of latent features and
           collect the reconstruction error of each fit
    INPUT:
        tfidf - tfidf for posts, passed unchanged to nmf_on_posts
        min_features - first feature count to try (inclusive)
        max_features - last feature count to try (inclusive)
        verbose - when truthy, print a progress line before each fit
    OUTPUT:
        recon_errors - list with the `reconstruction_err_` value of each
                       fitted model, in the same order as `features`
        features - the range of feature counts that were tried
    """
    # Candidate feature counts; the upper bound is inclusive.
    features = range(min_features, max_features + 1)

    recon_errors = []
    for n_latent in features:
        if verbose:
            print(" Running NMF on tfidf for {} features".format(n_latent))

        # Only the fitted model is needed here; the W matrix is discarded.
        _, fitted_model = nmf_on_posts(tfidf, latent_features=n_latent)
        recon_errors.append(fitted_model.reconstruction_err_)

    return recon_errors, features
if __name__ == "__main__":
    # The number of NMF latent features is taken from the first command-line
    # argument; fall back to 30 when it is missing (IndexError) or is not an
    # integer (ValueError). Other exceptions are no longer swallowed.
    try:
        nmf_features = int(sys.argv[1])
    except (IndexError, ValueError):
        nmf_features = 30

    # Load all pickled objects: post_df, the post tfidf matrix, and tfidf.
    # NOTE(review): these pickles are assumed to be produced by this project;
    # pickle must not be used to load files from untrusted sources.
    print("Loading post df...")
    post_df = pd.read_pickle(POST_DF_PICKLE)
    with open(POST_TFIDF_MATRIX_PICKLE, "rb") as tfidf_matrix_file:
        tfidf_matrix = pickle.load(tfidf_matrix_file)
    # NOTE(review): `tfidf` is not referenced again in the visible part of
    # this script; the load is kept in case later code relies on it.
    with open(POST_TFIDF_PICKLE, "rb") as tfidf_file:
        tfidf = pickle.load(tfidf_file)

    # Perform NMF on the tfidf matrix to reduce the dimensionality of the
    # dataframe for models.
    # W matrix is a: post X latent_features matrix
    print("Reducing dimension of tfidf matrix to {} features".format(nmf_features))
    W_matrix, nmf = nmf_on_posts(tfidf_matrix, latent_features=nmf_features)

    # Pickle the nmf object and its W matrix for future use. The context
    # managers make sure each file is flushed and closed once written.
    with open(POST_NMF_PICKLE, "wb") as nmf_file:
        pickle.dump(nmf, nmf_file)
    with open(W_MATRIX_PICKLE, "wb") as w_matrix_file:
        pickle.dump(W_matrix, w_matrix_file)
    print(" nmf object pickled to {}".format(POST_NMF_PICKLE))
    print(" nmf w matrix pickled to {}".format(W_MATRIX_PICKLE))

    # Merge the nmf matrix with the post dataframe, then pickle the result
    # for later use.
    post_df = merge_W_with_post_df(W_matrix, post_df)
    post_df = add_main_topic_col(W_matrix, post_df)
    post_df = content_to_num_words_and_drop(post_df)
    post_df.to_pickle(POST_DF_W_NMF_PICKLE)
    print(" post df with nmf pickled to {}".format(POST_DF_W_NMF_PICKLE))