Example No. 1
def get_active_subreddits(search_freq=5):
    """

    """
    ## Initialize reddit Class
    reddit = RedditData(False)
    ## Retrieve Active Subreddits
    logger.info("Retrieving Active Subreddits")
    active_subreddits = reddit.identify_active_subreddits(
        START_DATE, END_DATE, search_freq=search_freq)
    return active_subreddits
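A minimal usage sketch for this helper, assuming the module-level START_DATE and END_DATE globals are ISO-format date strings and that the return value is a pandas Series of activity counts indexed by subreddit name (as the later examples suggest); the date window and threshold below are hypothetical:

## Hypothetical date window (assumption: START_DATE / END_DATE are ISO date strings)
START_DATE, END_DATE = "2020-02-01", "2020-02-28"

## Retrieve active subreddits with the default search frequency
active_subreddits = get_active_subreddits(search_freq=5)

## Keep only subreddits above an arbitrary activity threshold and inspect the top of the list
print(active_subreddits[active_subreddits >= 100].head())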
Example No. 2
def reddit_praw():
    """

    """
    ## Initialize with PRAW
    reddit = RedditData(init_praw=True)
    return reddit
Example No. 3
def reddit_psaw():
    """

    """
    ## Initialize With PSAW
    reddit = RedditData(init_praw=False)
    return reddit
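A minimal sketch of how the two initializers might be exercised; it assumes RedditData exposes the search_for_comments method used in the later examples, and the query, subreddit, and dates here are hypothetical:

## PSAW-backed instance (Pushshift queries, no PRAW session)
reddit = reddit_psaw()

## Hypothetical query over a fixed window
comments = reddit.search_for_comments(query="language learning",
                                      subreddit="languagelearning",
                                      start_date="2020-02-01",
                                      end_date="2020-02-28")
print(len(comments))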
Example No. 4
def get_user_item_matrix(active_subreddits,
                         cache_dir,
                         history_type="comment"):
    """

    """
    ## Initialize reddit Class
    reddit = RedditData(False)
    ## Collect Post Histories
    subreddits = active_subreddits.index.tolist()
    total_s = len(subreddits)
    for s, subreddit in enumerate(subreddits):
        logger.info("Collecting Subreddit {}/{}: {}".format(s+1, total_s, subreddit))
        subreddit_file = f"{cache_dir}{subreddit}.tar.gz"
        if os.path.exists(subreddit_file):
            continue
        author_post_history = reddit.retrieve_subreddit_user_history(subreddit,
                                                                     start_date=START_DATE,
                                                                     end_date=END_DATE,
                                                                     history_type=history_type,
                                                                     docs_per_chunk=5000)
        with gzip.open(subreddit_file, "wt") as the_file:
            if author_post_history is not None:
                json.dump(author_post_history.to_dict(), the_file)
            else:
                json.dump({}, the_file)
    ## Identify Unique Redditors
    redditors = set()
    history_files = sorted(glob(cache_dir + "*.tar.gz"))
    for h in history_files:
        with gzip.open(h, "r") as the_file:
            h_data = json.load(the_file)
            redditors.update(set(h_data))
    redditors = sorted(redditors)
    ## Create User-Item Matrix
    dvec = create_dict_vectorizer(redditors)
    X = []
    for h in history_files:
        with gzip.open(h, "r") as the_file:
            h_data = json.load(the_file)
            x = dvec.transform(h_data)
            X.append(x)
    ## Format
    X = vstack(X)
    rows = list(map(lambda h: os.path.basename(h)[:-7], history_files))
    columns = redditors
    return X, rows, columns
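create_dict_vectorizer is referenced above but not shown on this page. A minimal sketch of what it might look like, assuming it wraps scikit-learn's DictVectorizer with a fixed user vocabulary:

from sklearn.feature_extraction import DictVectorizer

def create_dict_vectorizer(vocab):
    """Build a DictVectorizer whose columns follow a fixed, pre-defined vocabulary."""
    dvec = DictVectorizer()
    dvec.vocabulary_ = dict((term, index) for index, term in enumerate(vocab))
    dvec.feature_names_ = list(vocab)
    return dvec

With a helper like this, each cached subreddit history (a {user: count} mapping) becomes one row of the sparse matrix, and every row shares the same user columns.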
Example No. 5
def main():
    """

    """
    ## Directory Setup
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    temp_dir = f"{CACHE_DIR}temp/"
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    ## Retrieve Active Subreddits
    if SUBREDDIT_LIST is not None:
        active_subreddits = pd.read_csv(SUBREDDIT_LIST, low_memory=True, index_col=0)["0"]
    else:
        subreddit_out_file = f"{CACHE_DIR}active_subreddits.csv"
        active_subreddits = get_active_subreddits()
        active_subreddits.to_csv(subreddit_out_file)
    ## Retrieve Subreddit Metadata
    reddit = RedditData(True)
    for subreddit in tqdm(active_subreddits.index.tolist(),
                          file=sys.stdout,
                          total=len(active_subreddits),
                          desc="Subreddit"):
        sub_temp_file = f"{temp_dir}{subreddit}.json"
        if os.path.exists(sub_temp_file):
            continue
        sub_metadata = reddit.retrieve_subreddit_metadata(subreddit)
        if sub_metadata is None:
            continue
        with open(sub_temp_file, "w") as the_file:
            json.dump(sub_metadata, the_file)
    ## Concatenate Data
    cached_files = glob(temp_dir + "*")
    metadata = pd.DataFrame([json.load(open(f,"r")) for f in cached_files])
    metadata = metadata.sort_values("subscribers",ascending=False).reset_index(drop=True)
    ## Cache
    metadata.to_csv(f"{CACHE_DIR}subreddit_metadata.csv")
    logger.info("Script Complete")
Example No. 6
def retrieve_language_samples(active_subreddits,
                              cache_dir,
                              sample_type="comment"):
    """

    """
    ## Subreddits Meeting Criteria
    subreddits = active_subreddits[active_subreddits >= MIN_ACTIVITY]
    ## Initialize Reddit
    reddit = RedditData(False)
    ## Retrieve Samples for Each Subreddit
    for subreddit, sample in tqdm(subreddits.items(),
                                  total=len(subreddits),
                                  file=sys.stdout):
        ## Sample File
        sample_file = f"{cache_dir}{subreddit}.json"
        if os.path.exists(sample_file):
            continue
        ## Query Based on Data Type
        if sample_type == "comment":
            endpoint = reddit.search_for_comments
        elif sample_type == "submission":
            endpoint = reddit.search_for_submissions
        else:
            raise ValueError(
                "Expected sample_type to be either comment or submission")
        df = endpoint(None,
                      subreddit=subreddit,
                      start_date=START_DATE,
                      end_date=END_DATE,
                      limit=SAMPLE_SIZE)
        ## Extract Text Samples
        text_samples = dict()
        if len(df) > 0 and sample_type == "comment":
            text_samples = dict(zip(df["id"].tolist(), df["body"].tolist()))
        elif len(df) > 0 and sample_type == "submission":
            text_samples = dict(
                zip(df["id"].tolist(),
                    (df["title"] + " " + df["selftext"]).tolist()))
        ## Cache Samples
        with open(sample_file, "w") as the_file:
            json.dump(text_samples, the_file)
Example No. 7
def update_user_histories(active_users):
    """

    """
    ## Get Today's Date
    today = datetime.now().date()
    ## Initialize Connection
    user_history_con = sql.connect(USER_HISTORY_DB_PATH)
    cursor = user_history_con.cursor()
    ## Get User Query Dates
    query_dates = {}
    max_date_command = """
    SELECT USER, QUERY_END_DATE
    FROM HISTORY
    WHERE USER='{}'
    ORDER BY QUERY_END_DATE DESC
    LIMIT 1;"""
    for user in tqdm(active_users,
                     total=len(active_users),
                     desc="Query Periods",
                     file=sys.stdout):
        res = cursor.execute(max_date_command.format(user))
        result = res.fetchall()
        if len(result) == 0:
            query_dates[user] = (GLOBAL_START_DATE, today.isoformat())
        else:
            if result[0][1] != today.isoformat():
                query_dates[user] = (result[0][1], today.isoformat())
    ## Initialize Reddit Wrapper
    reddit = RedditData()
    ## Query Comment History
    user_comment_histories = []
    for user, (start, stop) in tqdm(query_dates.items(),
                                    total=len(query_dates),
                                    file=sys.stdout,
                                    desc="User Histories"):
        df = reddit.retrieve_author_comments(user,
                                             start_date=start,
                                             end_date=stop)
        try:
            subreddit_counts = df.groupby(
                ["author"])["subreddit"].value_counts().rename(
                    "COMMENT_COUNT").reset_index()
        except:
            print('failed user:', user)
            import pdb
            pdb.set_trace()
        subreddit_counts["QUERY_START_DATE"] = start
        subreddit_counts["QUERY_END_DATE"] = stop
        subreddit_counts.rename(columns={
            "author": "USER",
            "subreddit": "SUBREDDIT"
        },
                                inplace=True)
        user_comment_histories.append(subreddit_counts)
    ## Update Database
    if len(user_comment_histories) > 0:
        user_comment_histories = pd.concat(user_comment_histories).reset_index(
            drop=True)
        user_comment_histories.to_sql(
            name="HISTORY",
            con=user_history_con,
            if_exists="append",
            index=False,
        )
    ## Close Connection
    user_history_con.commit()
    user_history_con.close()
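The HISTORY table schema is not shown on this page. A minimal sketch of what the code above implies, written against sqlite3; the column types are assumptions:

import sqlite3 as sql

def initialize_history_db(db_path):
    """Create the HISTORY table expected by update_user_histories (schema assumed)."""
    con = sql.connect(db_path)
    con.execute("""
        CREATE TABLE IF NOT EXISTS HISTORY (
            USER TEXT,
            SUBREDDIT TEXT,
            COMMENT_COUNT INTEGER,
            QUERY_START_DATE TEXT,
            QUERY_END_DATE TEXT
        );""")
    con.commit()
    con.close()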
Example No. 8
## Search Terms
SEARCH_TERMS = [
                "sorry for the bad english",
                "not my native",
                "still learning english"
                ]

STRICT_FILTER = False

##################
### Query Data
##################

## Initialize Reddit Data Wrapper
reddit = RedditData(False)

## Cycle Through Search Terms
search_results = []
for ST in tqdm(SEARCH_TERMS, total=len(SEARCH_TERMS), file=sys.stdout):
    st_results = reddit.search_for_comments(query=ST,
                                            subreddit=None,
                                            start_date=START_DATE,
                                            end_date=END_DATE)
    st_results["search_term"] = ST
    if STRICT_FILTER:
        st_results = st_results.loc[st_results["body"].str.lower().str.contains(ST)]
    search_results.append(st_results)

## Concatenate Data
search_results = pd.concat(search_results)
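A small follow-up sketch for inspecting the concatenated results and caching them to disk; the output path is hypothetical:

## Number of matches per search term
print(search_results["search_term"].value_counts())

## Cache the concatenated results (hypothetical path)
search_results.to_csv("./data/raw/search_results.csv", index=False)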
Example No. 9
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################

reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
                password='******',
                user_agent='testscript by /u/pocaguirre',
                username='******')

## Initialize PSAW (Fast Reddit Data Queries)
psaw = RedditData()

## Initialize Recommendation Model
REC_MODEL_PATH = "./rrec/models/comments_20200221_20200228.cf"
recommender = None

## Recommendation Database Paths
DB_PATH = "./rrec/data/db/"
USER_HISTORY_DB_PATH = f"{DB_PATH}user_history.db"
RECOMMENDATION_HISTORY_DB_PATH = f"{DB_PATH}recommendations.db"

## Subreddit Thumbnails
with open("subreddit_thumbnails.json", "r") as f:
    subreddit_list = json.load(f)
    subreddit_df = pd.DataFrame(subreddit_list)
Example No. 10
###################
### Load Metadata
###################

## Load Metadata
metadata = pd.read_csv("./data/raw/metadata/subreddit_metadata.csv",
                       low_memory=False,
                       index_col=0)
names = metadata["name"].tolist()

###################
### Query Image URLs
###################

## Query Data
reddit = RedditData(True)
img_urls = {}
for sub in tqdm(reddit._praw.info(fullnames=names), total=len(names)):
    display_name = sub.display_name
    img_urls[display_name] = {
        "icon_img": sub.icon_img,
        "header_img": sub.header_img,
        "banner_img": sub.banner_img,
        "community_icon": sub.community_icon,
    }

###################
### Format
###################

## Format Into DataFrame
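The formatting step is cut off on the example page; a minimal sketch of how the img_urls dictionary might be turned into a DataFrame and cached (the output path is hypothetical):

img_urls_df = pd.DataFrame.from_dict(img_urls, orient="index")
img_urls_df.index.name = "subreddit"
img_urls_df.to_csv("./data/raw/metadata/subreddit_images.csv")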
Example No. 11
subreddit_mask = np.nonzero((X_masked > 0).sum(axis=1) >= MIN_SUPPORT)[0]
X_masked = X_masked[subreddit_mask]
rows_masked = [rows[i] for i in subreddit_mask]

## Weight Using BM25
if BM25_WEIGHTING:
    X_masked = bm25_weight(X_masked).tocsr()

## Fit Model
cf = CollaborativeFiltering(factors=N_FACTORS,
                            regularization=REGULARIZATION,
                            iterations=ITERATIONS,
                            num_threads=NUM_THREADS,
                            random_state=RANDOM_STATE)
cf = cf.fit(X_masked, rows=rows_masked, columns=columns_masked)

#####################
### Testing
#####################

## Test Recommendations
reddit = RedditData()
keith = reddit.retrieve_author_comments("HuskyKeith")
keith_counts = keith["subreddit"].tolist()
keith_recs = cf.recommend(keith_counts, 20)

## Test Similarity
cf.get_similar_item("movies")

## Dump Model
cf.dump(f"{MODEL_DIR}{MODEL_NAME}")
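This snippet begins mid-script; for reference, the imports it relies on would look roughly like the following. bm25_weight is presumably the implementation from the implicit library, and the project-local import paths for CollaborativeFiltering and RedditData are assumptions:

import numpy as np
from implicit.nearest_neighbours import bm25_weight

## Assumed project-local import paths (hypothetical)
from rrec.model.collaborative_filtering import CollaborativeFiltering
from rrec.acquire.reddit import RedditData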