def init_upvote(subreddit, feature, ngram, lda, reducer, learner, dim): """ initializes the models for predicting upvotes """ global UP_DF, UP_FEATURE, UP_REDUCER, UP_LEARNER, UP_DIM print "subreddit file:", subreddit UP_DF = load_subreddit(subreddit) name = UP_DF["subreddit"][0] print "subreddit:", name print "num rows:", len(UP_DF.index) print "max up:", features.denormalize_scores([1.], name) if feature == "ngram": UP_FEATURE = features.NGramModel(ngram) elif feature == "lda": UP_FEATURE = features.LdaFeatureModel(lda) UP_REDUCER = REDUCERS[reducer](dim) UP_LEARNER = LEARNERS[learner]() UP_DIM = dim print "training models..." train_upvotes(UP_DF, UP_FEATURE, UP_REDUCER, UP_LEARNER) print "upvote init done"
def main(subreddit, comments, model_name, reducer_name, learner_name, dim, folds, clusters): model = FEATURES[model_name]() if model_name == "lda": reducer_name = "none" reducer = REDUCERS[reducer_name](dim) learner = LEARNERS[learner_name]() print "model: %s, reducer: %s, learner: %s, reduced dim: %d" \ % (model_name, reducer_name, learner_name, dim) print "opening subreddit file:", subreddit df = load_subreddit(subreddit) subreddit_name = df["subreddit"][0] print "subreddit:", subreddit_name print "num rows:", len(df.index) print "max upvotes:", features.denormalize_scores([1.], subreddit_name) if folds > 0: print ">>>>> cross validating with %d folds" % folds test_performance(df, model_name, learner_name, reducer_name, folds, dim) print ">>>>>" # don't bother to produce the training set or reduce dimensionality # if we are not providede with a test file or cluster numbers if comments == "" and clusters == 0: return print ">>>>>>" print "making training data..." X_train, Y_train = model.make_training_xy(df) print "done" print "reducing dimensionality..." reducer.fit(X_train, Y_train) X_train_red = reducer.transform(X_train) print "done" if comments != "": print "training learner..." learner.train(X_train_red, Y_train) print "done" print "getting test data from %s ..." % comments testfile = open(comments, "rb") testdata = testfile.readlines() testdata = [line.strip() for line in testdata] testfile.close() new_df = pd.DataFrame({ 'body': testdata, 'subreddit': [subreddit_name] * len(testdata) }) X_test = model.data_to_x(new_df) X_test_red = reducer.transform(X_test) print "done" print "predicting test labels..." Y_test = learner.predict(X_test_red) Y_upvotes = model.y_to_label(df, Y_test) print "done" print "" print ">>>>> RESULTS" for comment, upvote in zip(testdata, Y_upvotes): print upvote, comment print ">>>>>" print if clusters > 0: print ">>>>> CLUSTERING with %d clusters" % clusters unsupervised.cluster_within_subreddit(df, X_train_red, clusters)
def main(subreddit, comments, model_name, reducer_name, learner_name, dim, folds, clusters): model = FEATURES[model_name]() if model_name == "lda": reducer_name = "none" reducer = REDUCERS[reducer_name](dim) learner = LEARNERS[learner_name]() print "model: %s, reducer: %s, learner: %s, reduced dim: %d" % (model_name, reducer_name, learner_name, dim) print "opening subreddit file:", subreddit df = load_subreddit(subreddit) subreddit_name = df["subreddit"][0] print "subreddit:", subreddit_name print "num rows:", len(df.index) print "max upvotes:", features.denormalize_scores([1.0], subreddit_name) if folds > 0: print ">>>>> cross validating with %d folds" % folds test_performance(df, model_name, learner_name, reducer_name, folds, dim) print ">>>>>" # don't bother to produce the training set or reduce dimensionality # if we are not providede with a test file or cluster numbers if comments == "" and clusters == 0: return print ">>>>>>" print "making training data..." X_train, Y_train = model.make_training_xy(df) print "done" print "reducing dimensionality..." reducer.fit(X_train, Y_train) X_train_red = reducer.transform(X_train) print "done" if comments != "": print "training learner..." learner.train(X_train_red, Y_train) print "done" print "getting test data from %s ..." % comments testfile = open(comments, "rb") testdata = testfile.readlines() testdata = [line.strip() for line in testdata] testfile.close() new_df = pd.DataFrame({"body": testdata, "subreddit": [subreddit_name] * len(testdata)}) X_test = model.data_to_x(new_df) X_test_red = reducer.transform(X_test) print "done" print "predicting test labels..." Y_test = learner.predict(X_test_red) Y_upvotes = model.y_to_label(df, Y_test) print "done" print "" print ">>>>> RESULTS" for comment, upvote in zip(testdata, Y_upvotes): print upvote, comment print ">>>>>" print if clusters > 0: print ">>>>> CLUSTERING with %d clusters" % clusters unsupervised.cluster_within_subreddit(df, X_train_red, clusters)