Example #1
0
def init_upvote(subreddit, feature, ngram, lda, reducer, learner, dim):
    """
    initializes the models for predicting upvotes
    """
    global UP_DF, UP_FEATURE, UP_REDUCER, UP_LEARNER, UP_DIM

    print "subreddit file:", subreddit
    UP_DF = load_subreddit(subreddit)
    name = UP_DF["subreddit"][0]
    print "subreddit:", name
    print "num rows:", len(UP_DF.index)
    print "max up:", features.denormalize_scores([1.], name)

    if feature == "ngram":
        UP_FEATURE = features.NGramModel(ngram)
    elif feature == "lda":
        UP_FEATURE = features.LdaFeatureModel(lda)
    UP_REDUCER = REDUCERS[reducer](dim)
    UP_LEARNER = LEARNERS[learner]()
    UP_DIM = dim

    print "training models..."
    train_upvotes(UP_DF, UP_FEATURE, UP_REDUCER, UP_LEARNER)
    print "upvote init done"
Example #2
0
def init_upvote(subreddit, feature, ngram, lda, reducer, learner, dim):
    """
    initializes the models for predicting upvotes
    """
    global UP_DF, UP_FEATURE, UP_REDUCER, UP_LEARNER, UP_DIM

    print "subreddit file:", subreddit
    UP_DF = load_subreddit(subreddit)
    name = UP_DF["subreddit"][0]
    print "subreddit:", name
    print "num rows:", len(UP_DF.index)
    print "max up:", features.denormalize_scores([1.], name)

    if feature == "ngram":
        UP_FEATURE = features.NGramModel(ngram)
    elif feature == "lda":
        UP_FEATURE = features.LdaFeatureModel(lda)
    UP_REDUCER = REDUCERS[reducer](dim)
    UP_LEARNER = LEARNERS[learner]()
    UP_DIM = dim

    print "training models..."
    train_upvotes(UP_DF, UP_FEATURE, UP_REDUCER, UP_LEARNER)
    print "upvote init done"
Example #3
0
def main(subreddit, comments, model_name, reducer_name, learner_name, dim,
         folds, clusters):
    model = FEATURES[model_name]()
    if model_name == "lda":
        reducer_name = "none"
    reducer = REDUCERS[reducer_name](dim)
    learner = LEARNERS[learner_name]()

    print "model: %s, reducer: %s, learner: %s, reduced dim: %d" \
        % (model_name, reducer_name, learner_name, dim)

    print "opening subreddit file:", subreddit
    df = load_subreddit(subreddit)
    subreddit_name = df["subreddit"][0]
    print "subreddit:", subreddit_name
    print "num rows:", len(df.index)
    print "max upvotes:", features.denormalize_scores([1.], subreddit_name)

    if folds > 0:
        print ">>>>> cross validating with %d folds" % folds
        test_performance(df, model_name, learner_name, reducer_name, folds,
                         dim)
        print ">>>>>"

    # don't bother to produce the training set or reduce dimensionality
    # if we are not providede with a test file or cluster numbers
    if comments == "" and clusters == 0:
        return

    print ">>>>>>"
    print "making training data..."
    X_train, Y_train = model.make_training_xy(df)
    print "done"

    print "reducing dimensionality..."
    reducer.fit(X_train, Y_train)
    X_train_red = reducer.transform(X_train)
    print "done"

    if comments != "":
        print "training learner..."
        learner.train(X_train_red, Y_train)
        print "done"

        print "getting test data from %s ..." % comments
        testfile = open(comments, "rb")
        testdata = testfile.readlines()
        testdata = [line.strip() for line in testdata]
        testfile.close()
        new_df = pd.DataFrame({
            'body': testdata,
            'subreddit': [subreddit_name] * len(testdata)
        })
        X_test = model.data_to_x(new_df)
        X_test_red = reducer.transform(X_test)
        print "done"

        print "predicting test labels..."
        Y_test = learner.predict(X_test_red)
        Y_upvotes = model.y_to_label(df, Y_test)
        print "done"

        print ""
        print ">>>>> RESULTS"
        for comment, upvote in zip(testdata, Y_upvotes):
            print upvote, comment
        print ">>>>>"
        print

    if clusters > 0:
        print ">>>>> CLUSTERING with %d clusters" % clusters
        unsupervised.cluster_within_subreddit(df, X_train_red, clusters)
Example #4
0
def main(subreddit, comments, model_name, reducer_name, learner_name, dim, folds, clusters):
    model = FEATURES[model_name]()
    if model_name == "lda":
        reducer_name = "none"
    reducer = REDUCERS[reducer_name](dim)
    learner = LEARNERS[learner_name]()

    print "model: %s, reducer: %s, learner: %s, reduced dim: %d" % (model_name, reducer_name, learner_name, dim)

    print "opening subreddit file:", subreddit
    df = load_subreddit(subreddit)
    subreddit_name = df["subreddit"][0]
    print "subreddit:", subreddit_name
    print "num rows:", len(df.index)
    print "max upvotes:", features.denormalize_scores([1.0], subreddit_name)

    if folds > 0:
        print ">>>>> cross validating with %d folds" % folds
        test_performance(df, model_name, learner_name, reducer_name, folds, dim)
        print ">>>>>"

    # don't bother to produce the training set or reduce dimensionality
    # if we are not providede with a test file or cluster numbers
    if comments == "" and clusters == 0:
        return

    print ">>>>>>"
    print "making training data..."
    X_train, Y_train = model.make_training_xy(df)
    print "done"

    print "reducing dimensionality..."
    reducer.fit(X_train, Y_train)
    X_train_red = reducer.transform(X_train)
    print "done"

    if comments != "":
        print "training learner..."
        learner.train(X_train_red, Y_train)
        print "done"

        print "getting test data from %s ..." % comments
        testfile = open(comments, "rb")
        testdata = testfile.readlines()
        testdata = [line.strip() for line in testdata]
        testfile.close()
        new_df = pd.DataFrame({"body": testdata, "subreddit": [subreddit_name] * len(testdata)})
        X_test = model.data_to_x(new_df)
        X_test_red = reducer.transform(X_test)
        print "done"

        print "predicting test labels..."
        Y_test = learner.predict(X_test_red)
        Y_upvotes = model.y_to_label(df, Y_test)
        print "done"

        print ""
        print ">>>>> RESULTS"
        for comment, upvote in zip(testdata, Y_upvotes):
            print upvote, comment
        print ">>>>>"
        print

    if clusters > 0:
        print ">>>>> CLUSTERING with %d clusters" % clusters
        unsupervised.cluster_within_subreddit(df, X_train_red, clusters)