def get_active_subreddits(search_freq=5):
    """
    Identify subreddits with recent activity.

    Args:
        search_freq (int): Sampling frequency passed through to
                           identify_active_subreddits.

    Returns:
        active_subreddits (pandas Series): Activity counts indexed by
                                           subreddit name.
    """
    ## Initialize Reddit Class (PSAW)
    reddit = RedditData(False)
    ## Retrieve Active Subreddits
    logger.info("Retrieving Active Subreddits")
    active_subreddits = reddit.identify_active_subreddits(START_DATE,
                                                          END_DATE,
                                                          search_freq=search_freq)
    return active_subreddits
def reddit_praw():
    """Return a RedditData instance backed by PRAW (the official Reddit API)."""
    ## Initialize With PRAW
    reddit = RedditData(init_praw=True)
    return reddit
def reddit_psaw():
    """Return a RedditData instance backed by PSAW (the Pushshift API)."""
    ## Initialize With PSAW
    reddit = RedditData(init_praw=False)
    return reddit
def get_user_item_matrix(active_subreddits, cache_dir, history_type="comment"):
    """
    Construct a sparse subreddit-by-user activity matrix.

    Args:
        active_subreddits (pandas Series): Activity counts indexed by subreddit name.
        cache_dir (str): Directory used to cache per-subreddit user histories.
        history_type (str): "comment" or "submission".

    Returns:
        X (sparse matrix): Subreddit x redditor activity counts.
        rows (list): Subreddit names (matrix rows).
        columns (list): Redditor names (matrix columns).
    """
    ## Initialize Reddit Class (PSAW)
    reddit = RedditData(False)
    ## Collect Post Histories
    subreddits = active_subreddits.index.tolist()
    total_s = len(subreddits)
    for s, subreddit in enumerate(subreddits):
        logger.info("Collecting Subreddit {}/{}: {}".format(s+1, total_s, subreddit))
        subreddit_file = f"{cache_dir}{subreddit}.tar.gz"
        if os.path.exists(subreddit_file):
            continue
        author_post_history = reddit.retrieve_subreddit_user_history(subreddit,
                                                                     start_date=START_DATE,
                                                                     end_date=END_DATE,
                                                                     history_type=history_type,
                                                                     docs_per_chunk=5000)
        ## Cache History (gzip-compressed JSON)
        with gzip.open(subreddit_file, "wt") as the_file:
            if author_post_history is not None:
                json.dump(author_post_history.to_dict(), the_file)
            else:
                json.dump({}, the_file)
    ## Identify Unique Redditors
    redditors = set()
    history_files = sorted(glob(cache_dir + "*.tar.gz"))
    for h in history_files:
        with gzip.open(h, "rt") as the_file:
            h_data = json.load(the_file)
        redditors.update(h_data)
    redditors = sorted(redditors)
    ## Create User-Item Matrix
    dvec = create_dict_vectorizer(redditors)
    X = []
    for h in history_files:
        with gzip.open(h, "rt") as the_file:
            h_data = json.load(the_file)
        x = dvec.transform(h_data)
        X.append(x)
    ## Format
    X = vstack(X)
    rows = list(map(lambda h: os.path.basename(h)[:-7], history_files))
    columns = redditors
    return X, rows, columns
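## Usage sketch (an addition, not part of the original script): persist the
## sparse user-item matrix plus its row/column labels so the collaborative
## filtering model can be fit in a separate step. Assumes scipy is available
## and that X is the scipy.sparse matrix produced by vstack above.
def cache_user_item_matrix(active_subreddits, cache_dir):
    from scipy.sparse import save_npz
    X, rows, columns = get_user_item_matrix(active_subreddits, cache_dir)
    ## Cache Matrix (compressed .npz)
    save_npz(f"{cache_dir}user_item_matrix.npz", X.tocsr())
    ## Cache Axis Labels
    with gzip.open(f"{cache_dir}user_item_axes.json.gz", "wt") as the_file:
        json.dump({"rows": rows, "columns": columns}, the_file)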
def main():
    """Collect and cache metadata for all active subreddits."""
    ## Directory Setup
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    temp_dir = f"{CACHE_DIR}temp/"
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    ## Retrieve Active Subreddits
    if SUBREDDIT_LIST is not None:
        active_subreddits = pd.read_csv(SUBREDDIT_LIST,
                                        low_memory=True,
                                        index_col=0)["0"]
    else:
        subreddit_out_file = f"{CACHE_DIR}active_subreddits.csv"
        active_subreddits = get_active_subreddits()
        active_subreddits.to_csv(subreddit_out_file)
    ## Retrieve Subreddit Metadata (PRAW)
    reddit = RedditData(True)
    for subreddit in tqdm(active_subreddits.index.tolist(),
                          file=sys.stdout,
                          total=len(active_subreddits),
                          desc="Subreddit"):
        sub_temp_file = f"{temp_dir}{subreddit}.json"
        if os.path.exists(sub_temp_file):
            continue
        sub_metadata = reddit.retrieve_subreddit_metadata(subreddit)
        if sub_metadata is None:
            continue
        with open(sub_temp_file, "w") as the_file:
            json.dump(sub_metadata, the_file)
    ## Concatenate Data
    cached_files = glob(temp_dir + "*")
    metadata = pd.DataFrame([json.load(open(f, "r")) for f in cached_files])
    metadata = metadata.sort_values("subscribers", ascending=False).reset_index(drop=True)
    ## Cache
    metadata.to_csv(f"{CACHE_DIR}subreddit_metadata.csv")
    logger.info("Script Complete")
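## Entry Point (a sketch; the original module presumably ends with a guard like this)
if __name__ == "__main__":
    main()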
def retrieve_language_samples(active_subreddits, cache_dir, sample_type="comment"):
    """
    Sample comment or submission text from sufficiently active subreddits.

    Args:
        active_subreddits (pandas Series): Activity counts indexed by subreddit name.
        cache_dir (str): Directory used to cache per-subreddit text samples.
        sample_type (str): "comment" or "submission".
    """
    ## Subreddits Meeting Criteria
    subreddits = active_subreddits[active_subreddits >= MIN_ACTIVITY]
    ## Initialize Reddit (PSAW)
    reddit = RedditData(False)
    ## Choose Query Endpoint Based on Data Type
    if sample_type == "comment":
        endpoint = reddit.search_for_comments
    elif sample_type == "submission":
        endpoint = reddit.search_for_submissions
    else:
        raise ValueError("Expected sample_type to be either comment or submission")
    ## Retrieve Samples for Each Subreddit
    for subreddit, _ in tqdm(subreddits.items(),
                             total=len(subreddits),
                             file=sys.stdout):
        ## Sample File
        sample_file = f"{cache_dir}{subreddit}.json"
        if os.path.exists(sample_file):
            continue
        ## Query Sample
        df = endpoint(None,
                      subreddit=subreddit,
                      start_date=START_DATE,
                      end_date=END_DATE,
                      limit=SAMPLE_SIZE)
        ## Extract Text Samples
        text_samples = dict()
        if df is not None and len(df) > 0 and sample_type == "comment":
            text_samples = dict(zip(df["id"].tolist(), df["body"].tolist()))
        elif df is not None and len(df) > 0 and sample_type == "submission":
            text_samples = dict(zip(df["id"].tolist(),
                                    (df["title"] + " " + df["selftext"]).tolist()))
        ## Cache Samples
        with open(sample_file, "w") as the_file:
            json.dump(text_samples, the_file)
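## Usage sketch (a hypothetical helper, not in the original script): reload the
## cached samples into a single {subreddit: {id: text}} dictionary for analysis.
def load_language_samples(cache_dir):
    samples = {}
    for sample_file in sorted(glob(cache_dir + "*.json")):
        subreddit = os.path.basename(sample_file)[:-5]  ## strip ".json"
        with open(sample_file, "r") as the_file:
            samples[subreddit] = json.load(the_file)
    return samples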
def update_user_histories(active_users):
    """Refresh each active user's per-subreddit comment counts in the SQLite database."""
    ## Get Today's Date
    today = datetime.now().date()
    ## Initialize Connection
    user_history_con = sql.connect(USER_HISTORY_DB_PATH)
    cursor = user_history_con.cursor()
    ## Get User Query Dates (most recent cached end date per user)
    query_dates = {}
    max_date_command = """
        SELECT USER, QUERY_END_DATE
        FROM HISTORY
        WHERE USER='{}'
        ORDER BY QUERY_END_DATE DESC
        LIMIT 1;"""
    for user in tqdm(active_users,
                     total=len(active_users),
                     desc="Query Periods",
                     file=sys.stdout):
        res = cursor.execute(max_date_command.format(user))
        result = res.fetchall()
        if len(result) == 0:
            query_dates[user] = (GLOBAL_START_DATE, today.isoformat())
        elif result[0][1] != today.isoformat():
            query_dates[user] = (result[0][1], today.isoformat())
    ## Initialize Reddit Wrapper
    reddit = RedditData()
    ## Query Comment History
    user_comment_histories = []
    for user, (start, stop) in tqdm(query_dates.items(),
                                    total=len(query_dates),
                                    file=sys.stdout,
                                    desc="User Histories"):
        df = reddit.retrieve_author_comments(user, start_date=start, end_date=stop)
        if df is None or len(df) == 0:
            continue
        subreddit_counts = df.groupby(["author"])["subreddit"].value_counts().rename(
            "COMMENT_COUNT").reset_index()
        subreddit_counts["QUERY_START_DATE"] = start
        subreddit_counts["QUERY_END_DATE"] = stop
        subreddit_counts.rename(columns={"author": "USER",
                                         "subreddit": "SUBREDDIT"},
                                inplace=True)
        user_comment_histories.append(subreddit_counts)
    ## Update Database
    if len(user_comment_histories) > 0:
        user_comment_histories = pd.concat(user_comment_histories).reset_index(drop=True)
        user_comment_histories.to_sql(name="HISTORY",
                                      con=user_history_con,
                                      if_exists="append",
                                      index=False)
    ## Close Connection
    user_history_con.commit()
    user_history_con.close()
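## Usage sketch (an assumption based on the schema implied by the inserts
## above): pull a user's aggregate per-subreddit comment counts back out
## of the HISTORY table.
def load_user_history(user):
    con = sql.connect(USER_HISTORY_DB_PATH)
    query = """
        SELECT SUBREDDIT, SUM(COMMENT_COUNT) AS COMMENT_COUNT
        FROM HISTORY
        WHERE USER = ?
        GROUP BY SUBREDDIT;"""
    history = pd.read_sql(query, con, params=(user,))
    con.close()
    return history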
## Search Terms
SEARCH_TERMS = [
    "sorry for the bad english",
    "not my native",
    "still learning english"
]
STRICT_FILTER = False

##################
### Query Data
##################

## Initialize Reddit Data Wrapper
reddit = RedditData(False)

## Cycle Through Search Terms
search_results = []
for ST in tqdm(SEARCH_TERMS, total=len(SEARCH_TERMS), file=sys.stdout):
    st_results = reddit.search_for_comments(query=ST,
                                            subreddit=None,
                                            start_date=START_DATE,
                                            end_date=END_DATE)
    st_results["search_term"] = ST
    if STRICT_FILTER:
        st_results = st_results.loc[st_results["body"].str.lower().str.contains(ST)]
    search_results.append(st_results)

## Concatenate Data
search_results = pd.concat(search_results)
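## De-duplication sketch (an addition, not in the original script): the search
## terms can overlap, so the same comment may match more than one query.
## Assumes the comment frame carries an "id" column, as PSAW results normally do.
search_results = search_results.drop_duplicates(subset="id", keep="first")
search_results = search_results.reset_index(drop=True)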
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################

## Initialize PRAW (Official Reddit API)
reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
                password='******',
                user_agent='testscript by /u/pocaguirre',
                username='******')

## Initialize PSAW (Fast Reddit Data Queries)
psaw = RedditData()

## Initialize Recommendation Model
REC_MODEL_PATH = "./rrec/models/comments_20200221_20200228.cf"
recommender = None

## Recommendation Database Paths
DB_PATH = "./rrec/data/db/"
USER_HISTORY_DB_PATH = f"{DB_PATH}user_history.db"
RECOMMENDATION_HISTORY_DB_PATH = f"{DB_PATH}recommendations.db"

## Subreddit Thumbnails
with open("subreddit_thumbnails.json", "r") as f:
    subreddit_list = json.load(f)
subreddit_df = pd.DataFrame(subreddit_list)
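## Safer alternative (a sketch, not part of the original app): build the PRAW
## client from environment variables so credentials stay out of source control.
## Assumes REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, and
## REDDIT_PASSWORD are exported in the environment.
def init_reddit_from_env():
    import os
    return Reddit(client_id=os.environ["REDDIT_CLIENT_ID"],
                  client_secret=os.environ["REDDIT_CLIENT_SECRET"],
                  password=os.environ["REDDIT_PASSWORD"],
                  user_agent="testscript by /u/pocaguirre",
                  username=os.environ["REDDIT_USERNAME"])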
###################
### Load Metadata
###################

## Load Metadata
metadata = pd.read_csv("./data/raw/metadata/subreddit_metadata.csv",
                       low_memory=False,
                       index_col=0)
names = metadata["name"].tolist()

###################
### Query Image URLs
###################

## Query Data
reddit = RedditData(True)
img_urls = {}
for sub in tqdm(reddit._praw.info(fullnames=names), total=len(names)):
    display_name = sub.display_name
    img_urls[display_name] = {
        "icon_img": sub.icon_img,
        "header_img": sub.header_img,
        "banner_img": sub.banner_img,
        "community_icon": sub.community_icon,
    }

###################
### Format
###################

## Format Into DataFrame
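## A plausible completion (the original file is truncated here; this is an
## assumption): pivot the img_urls dictionary into a DataFrame indexed by
## subreddit display name.
img_urls_df = pd.DataFrame.from_dict(img_urls, orient="index")
img_urls_df.index.name = "subreddit"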
subreddit_mask = np.nonzero((X_masked > 0).sum(axis=1) >= MIN_SUPPORT)[0]
X_masked = X_masked[subreddit_mask]
rows_masked = [rows[i] for i in subreddit_mask]

## Weight Using BM25
if BM25_WEIGHTING:
    X_masked = bm25_weight(X_masked).tocsr()

## Fit Model
cf = CollaborativeFiltering(factors=N_FACTORS,
                            regularization=REGULARIZATION,
                            iterations=ITERATIONS,
                            num_threads=NUM_THREADS,
                            random_state=RANDOM_STATE)
cf = cf.fit(X_masked, rows=rows_masked, columns=columns_masked)

#####################
### Testing
#####################

## Test Recommendations
reddit = RedditData()
keith = reddit.retrieve_author_comments("HuskyKeith")
keith_counts = keith["subreddit"].tolist()
keith_recs = cf.recommend(keith_counts, 20)

## Test Similarity
cf.get_similar_item("movies")

## Dump Model
cf.dump(f"{MODEL_DIR}{MODEL_NAME}")
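## Additional smoke test (a sketch using the same get_similar_item API as
## above): spot-check similar-item quality for a few seed subreddits after
## dumping the model. The seed names are arbitrary examples.
for seed in ["movies", "science", "soccer"]:
    print(seed, cf.get_similar_item(seed))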