# Validate the requested review limit before any data is loaded.
try:
    review_limit = int(args.review_limit)
except (TypeError, ValueError) as err:
    # ValueError: non-numeric text; TypeError: a non-string/non-number
    # value such as None. Both mean "not a number" to the user.
    raise Exception("Review limit must be a number") from err

# The check accepts exactly 100, so the message says "at least", not "over".
if review_limit < 100:
    raise Exception("Review limit must be at least 100")

# Step 1 - pre-process the training data:
#   * load the reviews and metadata and combine them into one pandas DataFrame
#   * clean the review text, remove stop words, and stem the result
pre_processing = PreProcessing(limit_reviews=review_limit)
df_reviews = pre_processing.get_df_reviews()
df_meta = pre_processing.get_df_meta()
combined = pre_processing.filter_and_combine(df_reviews, df_meta)

# The text helpers are fed a plain list taken from the column; the stemmed
# output is written back over the same column afterwards.
reviews_clean = pre_processing.preprocess_reviews(
    combined['reviewTextProcessed'].tolist())
no_stop_words = pre_processing.remove_stop_words(reviews_clean)
stemmed_reviews = pre_processing.get_stemmed_text(no_stop_words)
combined['reviewTextProcessed'] = stemmed_reviews

combined = pre_processing.change_categories_column(combined)

# Persist the processed frame as tab-separated, UTF-8 encoded text.
combined.to_csv(args.output_file, sep='\t', encoding='utf-8')
# NOTE: an earlier variant pickled `stemmed_reviews` to args.output_file
# instead of writing the combined frame; the TSV above replaces it.
except ValueError: raise Exception("Review limit must be a number") if review_limit < 100: raise Exception("Review limit must be over 100") # step 1 - pre processing the training data # convert to combined pandas dataframe # remving stopwords and stemming the review text pre_processing = PreProcessing(limit_reviews=review_limit) df_reviews = pre_processing.get_df_reviews() df_meta = pre_processing.get_df_meta() combined = pre_processing.filter_and_combine(df_reviews, df_meta) combined['reviewTextProcessed'] = pre_processing.preprocess_reviews( combined['reviewTextProcessed']) combined['reviewTextProcessed'] = pre_processing.remove_stop_words( combined['reviewTextProcessed']) combined['reviewTextProcessed'] = pre_processing.get_stemmed_text( combined['reviewTextProcessed']) reviews_and_sentiment = combined[['reviewTextProcessed', 'overall']] # convert string rating values to numerical values reviews_and_sentiment['overall'] = pd.to_numeric( reviews_and_sentiment['overall']) # convert the rating value to 1 or 0 (sentiment value) # if the average rating is 1, 2, 3 then 0 (negative sentiment) # if the average rating is 4 or 5 then 1 (positive sentiment) reviews_and_sentiment['sentiment'] = reviews_and_sentiment[