# identified by not having a certain month in their filenames and, depending on
# parameter settings, can include: author, sentiments, lda_prep, nn_prep,
# original_comm, original_indices, Random_Count_Dict, Random_Count_List,
# random_indices, RC_Count_Dict, RC_Count_List, total_count and votes
theparser = Parser()

# Set up the subfolders in which the dataset attributes are stored
theparser.safe_dir_create()

theparser.Parse_Rel_RC_Comments()

if Neural_Relevance_Filtering:
    # Prune irrelevant posts from the dataset using a transformer-based neural
    # network trained on human ratings. The path defaults to the Human_Ratings
    # folder
    theparser.Neural_Relevance_Screen()

    # Depends on the results produced by Neural_Relevance_Screen
    theparser.Neural_Relevance_Clean()

# Filter the dataset by whether posts are in English (uses Google's language
# detection)
# NOTE: The original text of the comments must be available on disk
# NOTE: Run this separately for LDA and NN, because their preprocessed
# comments are stored in separate files
# NOTE: Performance is significantly worse for shorter posts. By default, the
# filtering is only performed on posts that contain at least 20 words
theparser.lang_filtering()

# TODO: Run the function for alternative sentiment estimates after this

## TextBlob sentiment analysis is integrated into parsing. If requested and not
# To simplify the coding, I should just feed in consecutive IDs of each 24
# months through the sbatch file. In other words, the batch IDs should be
# determined as follows: 0 for (2008,1), then +1 for each month after.
# BUG: Because of a hacky solution within Neural_Relevance_Clean(), that
# function only works properly for a fully consecutive set of months within
# self.dates
# TODO: make it more general

### import the required modules and functions

import time
import sys
from Utils import Write_Performance
from config import *
#from ModelEstimation import NNModel
from transformers import BertTokenizer
from NN_Utils import *
from reddit_parser import Parser  # Does the parser object need to be adjusted?

# NOTE: Feed machine="local" as an argument if not running through the cluster
theparser = Parser()

# Prune irrelevant posts from the dataset using a transformer-based neural
# network trained on human ratings. The path defaults to the Human_Ratings
# folder
theparser.Neural_Relevance_Screen(batch_size=1200)