예제 #1
0
def get_all_values_jan_4():
    """Launch the ``jan_4`` feature extraction with a fixed configuration.

    Every experiment parameter is hard-coded below and handed positionally
    to ``get_features.write_to_csv``. Nothing is returned.
    """
    # Paths forwarded to write_to_csv (presumably the dataset root and the
    # CSV to produce -- confirm against get_features).
    base_path = "/home/ndg/projects/shared_datasets/reddit-style/"
    out_file = "/home/ndg/projects/shared_datasets/reddit-style/data/get_all_values_jan_4.csv"

    # Subreddits and the year / first month / last month settings.
    subreddits = ["mcgill"]
    year, start_month, end_month = 2016, 1, 2

    # TODO: make sure text_min is being used in all the right places
    ngrams, text_min, text_max = 4, 10, 1000

    # TODO: are these truly the values that you want?
    # Each of these says whether the matching value calculation is restricted
    # to just a certain subreddit; None is passed for all three.
    user_prolificness_subreddit = None
    user_karma_subreddit = None
    prior_interaction_subreddit = None

    relevant_categories = []  # TODO: fill this out

    get_features.write_to_csv(
        subreddits, year, start_month, end_month,
        ngrams, text_min, text_max,
        base_path, relevant_categories, out_file,
        user_prolificness_subreddit,
        user_karma_subreddit,
        prior_interaction_subreddit)
예제 #2
0
def test_dask_large():
    """Run the feature extraction once per partition of the large subreddits.

    Subreddit names are read, one per line, from ``../data/large_subs.txt``,
    split with ``partition(content, 10)``, and each resulting group is passed
    to ``get_features.write_to_csv`` with its own output file. All other
    experiment parameters are hard-coded below. Nothing is returned.
    """
    with open("../data/large_subs.txt") as f:
        content = [line.strip() for line in f]

    # NOTE(review): whether 10 is the number of groups or the group size
    # depends on partition(), which is defined elsewhere -- confirm.
    partitioned_subreddits = partition(content, 10)

    year = 2016

    start_month_pairs = 4
    end_month_pairs = 4

    num_months_back = 1

    ngrams = 5
    text_min = 5  # TODO: make sure this is being used in all the right places
    text_max = 10000

    num_pairs_cap = 5000
    num_pairs_min = 100

    # TODO: are these truly the values that you want?
    # Defines whether value calculation is restricted to just a certain
    # subreddit.
    restrict_to_subreddit_only = False

    # "they" and "ipron" are two separate categories; without the comma
    # between them Python silently joins adjacent literals into "theyipron".
    relevant_categories = ["ppron", "i", "we", "you", "shehe", "they", "ipron",
                           "article", "prep", "auxverb",
                           "conj", "negate", "verb", "adj", "compare",
                           "interrog", "number", "quant", "posemo", "negemo",
                           "anx", "anger", "sad"]
    base_path = "/home/ndg/projects/shared_datasets/reddit-style/"

    # Loop-invariant output filename template; the placeholders are, in
    # order: partition index, year, start month, end month minus months
    # back, ngrams, text_min, text_max.
    out_file_template = ("/home/ndg/projects/shared_datasets/reddit-style/output_data/"
                         "TESTDASKlarge_subs_{}_get_all_values_jan_27_"
                         "{}_{}_{}_{}_{}_{}.csv")

    for i, subreddits in enumerate(partitioned_subreddits):
        # Parenthesised form prints the same thing on Python 2 and Python 3.
        print(subreddits)
        out_file = out_file_template.format(i, year, start_month_pairs,
                                            end_month_pairs - num_months_back,
                                            ngrams, text_min, text_max)

        get_features.write_to_csv(subreddits, year, start_month_pairs,
                                  end_month_pairs, ngrams, text_min,
                                  text_max, base_path, relevant_categories,
                                  out_file, restrict_to_subreddit_only,
                                  num_pairs_cap, num_pairs_min,
                                  num_months_back)
예제 #3
0
def get_all_values_jan_22():
    """Run the feature extraction over the first 100 large subreddits.

    Subreddit names are read, one per line, from ``../data/large_subs.txt``;
    the first 100 are passed to ``get_features.write_to_csv`` together with
    the hard-coded experiment parameters below. Nothing is returned.
    """
    with open("../data/large_subs.txt") as f:
        content = [line.strip() for line in f]

    subreddits = content[:100]

    year = 2016

    start_month_pairs = 4
    end_month_pairs = 4

    start_month_metadata = 1
    end_month_metadata = 4

    ngrams = 5
    text_min = 0  # TODO: make sure this is being used in all the right places
    text_max = 10000

    num_pairs_cap = 10000
    num_pairs_min = 1000

    # TODO: are these truly the values that you want?
    # Defines whether value calculation is restricted to just a certain
    # subreddit.
    restrict_to_subreddit_only = False

    # "they" and "ipron" are two separate categories; without the comma
    # between them Python silently joins adjacent literals into "theyipron".
    relevant_categories = ["ppron", "i", "we", "you", "shehe", "they", "ipron",
                           "article", "prep", "auxverb",
                           "conj", "negate", "verb", "adj", "compare",
                           "interrog", "number", "quant", "posemo", "negemo",
                           "anx", "anger", "sad"]
    base_path = "/home/ndg/projects/shared_datasets/reddit-style/"

    # Placeholders, in order: year, start month, end month, ngrams,
    # text_min, text_max.
    out_file = ("/home/ndg/projects/shared_datasets/reddit-style/output_data/"
                "large_subs_get_all_values_jan_23_{}_{}_{}_{}_{}_{}.csv"
                ).format(year, start_month_pairs, end_month_pairs,
                         ngrams, text_min, text_max)

    # Language-model creation is currently disabled:
    # language_model.create_subreddit_language_models(subreddits, year,
    #                                                 start_month_pairs, end_month_pairs,
    #                                                 ngrams, text_min, text_max,
    #                                                 base_path)
    get_features.write_to_csv(subreddits, year, start_month_pairs,
                              end_month_pairs, start_month_metadata,
                              end_month_metadata, ngrams, text_min,
                              text_max, base_path, relevant_categories,
                              out_file, restrict_to_subreddit_only,
                              num_pairs_cap, num_pairs_min)