Exemplo n.º 1
0
    # Parse the review limit supplied via ``args.review_limit``.
    # Only ValueError is translated into the generic Exception below.
    # NOTE(review): a non-string, non-numeric value such as None would make
    # int() raise TypeError, which is not caught here -- confirm whether
    # args.review_limit can ever be None.
    try:
        review_limit = int(args.review_limit)
    except ValueError:
        raise Exception("Review limit must be a number")

    # Reject limits below 100.
    # NOTE(review): the message says "over 100", but the check lets exactly
    # 100 through -- confirm which of the two is intended.
    if review_limit < 100:
        raise Exception("Review limit must be over 100")

    # Step 1 - pre-process the training data:
    #   * load the reviews and the metadata and combine them into one
    #     dataframe
    #   * clean the review text, remove stop words and stem it
    pre_processing = PreProcessing(limit_reviews=review_limit)

    # Presumably both helpers return pandas DataFrames -- verify against
    # the PreProcessing class.
    df_reviews = pre_processing.get_df_reviews()
    df_meta = pre_processing.get_df_meta()

    combined = pre_processing.filter_and_combine(df_reviews, df_meta)

    # The text pipeline works on a plain Python list taken from the
    # 'reviewTextProcessed' column: clean -> drop stop words -> stem.
    reviews_clean = pre_processing.preprocess_reviews(
        combined['reviewTextProcessed'].tolist())
    no_stop_words = pre_processing.remove_stop_words(reviews_clean)
    stemmed_reviews = pre_processing.get_stemmed_text(no_stop_words)

    # Write the stemmed text back over the original column, then let the
    # helper rewrite the categories column.
    combined['reviewTextProcessed'] = stemmed_reviews
    combined = pre_processing.change_categories_column(combined)

    # Persist the result to ``args.output_file`` as tab-separated, UTF-8
    # encoded text.
    combined.to_csv(args.output_file, sep='\t', encoding='utf-8')

    # Disabled alternative: pickle the list of pre-processed reviews to the
    # output file instead of writing the tab-separated dataframe.
    # with open(args.output_file, 'wb') as fp:
    #     pickle.dump(stemmed_reviews, fp)
    # Reject limits below 100.
    # NOTE(review): the message says "over 100", but the check lets exactly
    # 100 through -- confirm which of the two is intended.
    if review_limit < 100:
        raise Exception("Review limit must be over 100")

    # Step 1 - pre-process the training data:
    #   * load the reviews and the metadata and combine them into one
    #     dataframe
    #   * clean the review text, remove stop words and stem it
    pre_processing = PreProcessing(limit_reviews=review_limit)

    # Presumably both helpers return pandas DataFrames -- verify against
    # the PreProcessing class.
    df_reviews = pre_processing.get_df_reviews()
    df_meta = pre_processing.get_df_meta()

    combined = pre_processing.filter_and_combine(df_reviews, df_meta)

    # Each text step receives the 'reviewTextProcessed' column itself (no
    # list conversion) and its result is stored back into that same column:
    # clean -> drop stop words -> stem.
    combined['reviewTextProcessed'] = pre_processing.preprocess_reviews(
        combined['reviewTextProcessed'])
    combined['reviewTextProcessed'] = pre_processing.remove_stop_words(
        combined['reviewTextProcessed'])
    combined['reviewTextProcessed'] = pre_processing.get_stemmed_text(
        combined['reviewTextProcessed'])

    # Keep only the processed text and the 'overall' rating column.
    # NOTE(review): if `combined` is a pandas DataFrame, this selection may
    # be treated as a derived copy, and the column assignments below can
    # then emit SettingWithCopyWarning -- consider an explicit .copy().
    reviews_and_sentiment = combined[['reviewTextProcessed', 'overall']]

    # Convert string rating values to numerical values.
    reviews_and_sentiment['overall'] = pd.to_numeric(
        reviews_and_sentiment['overall'])

    # Derive a binary sentiment label from the rating:
    #   rating > 3            -> 1 (positive sentiment)
    #   anything else (<= 3)  -> 0 (negative sentiment)
    reviews_and_sentiment['sentiment'] = reviews_and_sentiment[
        'overall'].apply(lambda x: 1 if x > 3 else 0)
    reviews_and_sentiment['sentiment'] = [