def main():
    """Command-line driver: extract NLP features for game reviews and store
    them in the MongoDB database.

    Parses command-line options, sets up DEBUG-level logging to both the
    console and a log file, connects to MongoDB, and then runs
    ``extract_nlp_features_into_db`` once per requested game file.

    NOTE(review): relies on module-level names imported/defined elsewhere in
    this file (``ArgumentParser``, ``ArgumentDefaultsHelpFormatter``,
    ``project_dir``, ``join``, ``realpath``, ``dirname``, ``splitext``).
    """
    parser = ArgumentParser(usage='python extract_features.py --game_files '
                                  'GAME_FILE1,GAME_FILE2,...[ OPTIONS]',
                            description='Extract features and add them to '
                                        'the Mongo database.',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    # Bind the bound method once to shorten the many add_argument calls below
    parser_add_argument = parser.add_argument
    parser_add_argument('--game_files',
        help='Comma-separated list of file-names or "all" for all of the '
             'files (the game files should reside in the "data" directory; '
             'the .jsonlines suffix is not necessary, but the file-names '
             'should be exact matches otherwise).',
        type=str,
        required=True)
    parser_add_argument('--do_not_binarize_features',
        help='Do not make all non-zero feature frequencies equal to 1.',
        action='store_true',
        default=False)
    parser_add_argument('--do_not_lowercase_text',
        help='Do not make lower-casing part of the review text '
             'normalization step, which affects word n-gram-related '
             'features.',
        action='store_true',
        default=False)
    parser_add_argument('--lowercase_cngrams',
        help='Lower-case the review text before extracting character n-gram '
             'features.',
        action='store_true',
        default=False)
    parser_add_argument('--partition',
        help='Data partition, i.e., "training", "test", etc. Value must be a '
             'valid partition set name in the Mongo database. Alternatively, '
             'the value "all" can be used to include all partitions.',
        type=str,
        default='all')
    parser_add_argument('--do_not_reuse_extracted_features',
        help="Don't make use of previously-extracted features present in the"
             " Mongo database and instead replace them if they are.",
        action='store_true',
        default=False)
    parser_add_argument('-dbhost', '--mongodb_host',
        help='Host that the MongoDB server is running on.',
        type=str,
        default='localhost')
    parser_add_argument('-dbport', '--mongodb_port',
        help='Port that the MongoDB server is running on.',
        type=int,
        default=27017)
    parser_add_argument('-log', '--log_file_path',
        help='Path to feature extraction log file.',
        type=str,
        default=join(project_dir, 'logs', 'replog_extract_features.txt'))
    args = parser.parse_args()

    # Imports
    # Deferred until after argument parsing so that "--help" does not pay
    # the cost of (or fail on) the project-local imports below
    import logging
    from util.mongodb import connect_to_db
    from util.datasets import get_game_files
    from src.features import extract_nlp_features_into_db

    # Make local copies of arguments
    # The "do_not_*" flags are inverted here so the rest of the code can
    # work with positive booleans
    game_files = args.game_files
    binarize = not args.do_not_binarize_features
    reuse_features = not args.do_not_reuse_extracted_features
    lowercase_text = not args.do_not_lowercase_text
    lowercase_cngrams = args.lowercase_cngrams
    partition = args.partition
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Setup logger and create logging handlers (DEBUG level on both the
    # stream handler and the file handler)
    logger = logging.getLogger('extract_features')
    logging_debug = logging.DEBUG
    logger.setLevel(logging_debug)
    loginfo = logger.info
    logdebug = logger.debug
    logerr = logger.error  # NOTE(review): bound but unused in this function
    logwarn = logger.warning  # NOTE(review): bound but unused in this function
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -'
                                  ' %(message)s')
    sh = logging.StreamHandler()
    sh.setLevel(logging_debug)
    fh = logging.FileHandler(realpath(args.log_file_path))
    fh.setLevel(logging_debug)
    sh.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(sh)
    logger.addHandler(fh)

    # Print out some logging information about the upcoming tasks
    logdebug('Project directory: {}'.format(project_dir))
    logdebug('Binarize features? {}'.format(binarize))
    logdebug('Try to reuse previously-extracted features in the database? {}'
             .format(reuse_features))
    logdebug('Lower-case text as part of the normalization step? {}'
             .format(lowercase_text))
    logdebug('Lower-case character n-grams during feature extraction? {}'
             .format(lowercase_cngrams))

    # Establish connection to MongoDB database collection
    loginfo('Connecting to MongoDB database on mongodb://{}:{}...'
            .format(mongodb_host, mongodb_port))
    reviewdb = connect_to_db(host=mongodb_host, port=mongodb_port)
    # w=0 disables write acknowledgement (fire-and-forget writes) —
    # presumably a deliberate speed/safety trade-off; verify it is intended
    reviewdb.write_concern['w'] = 0

    # Get list of games (resolves the "all" value / bare file-names against
    # the project "data" directory)
    game_files = get_game_files(game_files,
                                join(dirname(dirname(__file__)), 'data'))

    # Iterate over the game files, extracting and adding/replacing
    # features to the database
    for game_file in game_files:
        game = splitext(game_file)[0]
        if partition == 'all':
            partition_string = (' from the "training" and "test" data '
                                'partitions')
        else:
            partition_string = (' from the "{}" data partition'
                                .format(partition))
        loginfo('Extracting features{} for {}...'
                .format(partition_string, game))
        extract_nlp_features_into_db(reviewdb,
                                     partition,
                                     game,
                                     reuse_nlp_feats=reuse_features,
                                     use_binarized_nlp_feats=binarize,
                                     lowercase_text=lowercase_text,
                                     lowercase_cngrams=lowercase_cngrams)
def main():
    """Command-line driver: build training/test partitions of game reviews
    and insert them into the MongoDB database.

    Parses command-line options, validates the binning/reporting flag
    combinations, sets up INFO-level logging to console and file, connects
    to MongoDB, and then runs ``insert_train_test_reviews`` once per
    requested game file.

    NOTE(review): relies on module-level names imported/defined elsewhere in
    this file (``ArgumentParser``, ``ArgumentDefaultsHelpFormatter``,
    ``project_dir``, ``join``, ``abspath``, ``realpath``, ``exists``,
    ``dirname``, ``basename``, ``splitext``).
    """
    parser = \
        ArgumentParser(usage='python make_train_test_sets.py --game_files '
                             'GAME_FILE1,GAME_FILE2,...[ OPTIONS]',
                       description='Build train/test sets for each game. Take'
                                   ' up to 21k reviews and split it 80/20 '
                                   'training/test, respectively, by default. '
                                   'Both the maximum size and the percentage '
                                   'split can be altered via command-line '
                                   'flags. All selected reviews will be put '
                                   'into the "reviews_project" database\'s '
                                   '"reviews" collection (which is being '
                                   ' hosted on lemur.montclair.edu on port '
                                   '27017).',
                       formatter_class=ArgumentDefaultsHelpFormatter)
    # Bind the bound method once to shorten the many add_argument calls below
    parser_add_argument = parser.add_argument
    parser_add_argument('--game_files',
        help='Comma-separated list of file-names or "all" for all of the '
             'files (the game files should reside in the "data" directory).',
        type=str,
        required=True)
    parser_add_argument('--max_size', '-m',
        help='Maximum number of reviews to get for training/testing (if '
             'possible).',
        type=int,
        default=4000)
    parser_add_argument('--percent_train', '-%',
        help='Percent of selected reviews for which to use for the training '
             'set, the rest going to the test set.',
        type=float,
        default=80.0)
    parser_add_argument('--convert_to_bins', '-bins',
        help='Number of sub-divisions of the hours-played values, e.g. if 10 '
             'and the hours values range from 0 up to 1000, then hours values'
             ' 0-99 will become 1, 100-199 will become 2, etc. (will '
             'probably be necessay to train a model that actually is '
             'predictive to an acceptable degree); note that both hours '
             'values will be retained, the original under the name "hours" '
             'and the converted value under the name "hours_bin".',
        type=int,
        required=False)
    parser_add_argument('--bin_factor',
        help='If the --convert_to_bins/-bins argument is specified, increase '
             'the sizes of the bins by the given factor so that bins in which'
             ' there will be lots of instances will be smaller in terms of '
             'range than bins that are more spasely-populated.',
        type=float,
        required=False)
    parser_add_argument('--make_reports', '-describe',
        help='Generate reports and histograms describing the data filtering '
             'procedure.',
        action='store_true',
        default=False)
    parser_add_argument('--just_describe',
        help='Generate reports and histograms describing the data filtering '
             'procedure, but then do NOT insert the reviews into the DB.',
        action='store_true',
        default=False)
    parser_add_argument('--reports_dir',
        help='If -describe/--make_reports is used, put generated reports in '
             'the given directory.',
        type=str,
        required=False)
    parser_add_argument('-dbhost', '--mongodb_host',
        help='Host that the MongoDB server is running on.',
        type=str,
        default='localhost')
    parser_add_argument('--mongodb_port', '-dbport',
        help='Port that the MongoDB server is running on.',
        type=int,
        default=27017)
    parser_add_argument('--log_file_path', '-log',
        help='Path for log file.',
        type=str,
        default=join(project_dir, 'logs', 'replog_make_train_test_sets.txt'))
    args = parser.parse_args()

    # Imports
    # Deferred until after argument parsing so that "--help" does not pay
    # the cost of (or fail on) the project-local/third-party imports below.
    # NOTE(review): ``listdir`` and ``MongoClient`` are imported but never
    # used in this function — candidates for removal.
    import logging
    from sys import exit
    from os import listdir
    from pymongo import MongoClient
    from util.datasets import get_game_files
    from util.mongodb import (connect_to_db,
                              insert_train_test_reviews)

    # Make local copies of arguments
    game_files = args.game_files
    max_size = args.max_size
    percent_train = args.percent_train
    convert_to_bins = args.convert_to_bins
    bin_factor = args.bin_factor
    make_reports = args.make_reports
    just_describe = args.just_describe
    reports_dir = args.reports_dir
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Initialize logging system (INFO level on both handlers)
    logging_info = logging.INFO
    logger = logging.getLogger('make_train_test_sets')
    logger.setLevel(logging_info)

    # Create file handler
    fh = logging.FileHandler(abspath(args.log_file_path))
    fh.setLevel(logging_info)

    # Create console handler
    sh = logging.StreamHandler()
    sh.setLevel(logging_info)

    # Add nicer formatting
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -'
                                  ' %(message)s')
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(sh)
    loginfo = logger.info
    logerror = logger.error
    logwarn = logger.warning

    # Make sure value passed in via the --convert_to_bins/-bins option
    # flag makes sense and, if so, assign value to variable bins (if
    # not, set bins equal to 0)
    if (convert_to_bins and convert_to_bins < 2):
        logerror('The value passed in via --convert_to_bins/-bins must be '
                 'greater than one since there must be multiple bins to '
                 'divide the hours played values. Exiting.')
        exit(1)
    elif convert_to_bins:
        bins = convert_to_bins
    else:
        # bins == 0 signals "no binning" downstream
        bins = 0

    # Make sure that, if the --bin_factor argument is specified, the
    # --convert_to_bins/-bins argument was also specified
    if (bin_factor and not convert_to_bins):
        logerror('The --bin_factor argument was specified despite the fact '
                 'that the --convert_to_bins/-bins argument was not used. '
                 'Exiting.')
        exit(1)

    # Establish connection to MongoDB database
    loginfo('Connecting to MongoDB database on mongodb://{}:{}...'
            .format(mongodb_host, mongodb_port))
    reviewdb = connect_to_db(host=mongodb_host, port=mongodb_port)
    # w=0 disables write acknowledgement (fire-and-forget writes) —
    # presumably a deliberate speed/safety trade-off; verify it is intended
    reviewdb.write_concern['w'] = 0

    # Get path to the directories
    data_dir = join(project_dir, 'data')
    if reports_dir:
        reports_dir = realpath(reports_dir)

    # Make sure args make sense
    # NOTE(review): both error messages below hard-code the threshold
    # values ("50", "1.0%") rather than echoing the actual value the user
    # passed in — apparently intentional snark, but worth confirming.
    if max_size < 50:
        logerror('You can\'t be serious, right? You passed in a value of 50 '
                 'for the MAXIMUM size of the combination of training/test '
                 'sets? Exiting.')
        exit(1)
    if percent_train < 1.0:
        logerror('You can\'t be serious, right? You passed in a value of 1.0%'
                 ' for the percentage of the selected reviews that will be '
                 'devoted to the training set? That is not going to be enough'
                 ' training samples. Exiting.')
        exit(1)

    # Make sense of arguments
    if (make_reports and just_describe):
        logwarn('If the --just_describe and -describe/--make_reports option '
                'flags are used, --just_describe wins out, i.e., reports will'
                ' be generated, but no reviews will be inserted into the '
                'database.')
    elif (reports_dir and (make_reports or just_describe)):
        if not exists(reports_dir):
            logerror('The given --reports_dir path was invalid. Exiting.')
            exit(1)

    # Get list of games (resolves the "all" value / bare file-names against
    # the project "data" directory)
    game_files = get_game_files(game_files,
                                join(dirname(dirname(__file__)), 'data'))
    loginfo('Adding training/test partitions to Mongo DB for the following '
            'games: {}'.format(', '.join([splitext(game)[0]
                                          for game in game_files])))
    loginfo('Maximum size for the combined training/test sets: {}'
            .format(max_size))
    loginfo('Percentage split between training and test sets: {0:.2f}/{1:.2f}'
            .format(percent_train, 100.0 - percent_train))
    if make_reports:
        loginfo('Generating reports in {}.'
                .format(reports_dir if reports_dir
                        else join(data_dir, 'reports')))
    if just_describe:
        loginfo('Exiting after generating reports.')
    if bins:
        loginfo('Converting hours played values to {} bins with a bin factor '
                'of {}.'.format(bins, bin_factor))

    # For each game in our list of games, we will read in the reviews
    # from the data file and then put entries in our MongoDB collection
    # with a key that identifies each review as either training or test
    for game_file in game_files:
        loginfo('Getting/inserting reviews for {}...'
                .format(splitext(basename(game_file))[0]))
        insert_train_test_reviews(reviewdb,
                                  abspath(join(data_dir, game_file)),
                                  max_size,
                                  percent_train,
                                  bins=bins,
                                  bin_factor=bin_factor,
                                  describe=make_reports,
                                  just_describe=just_describe,
                                  reports_dir=reports_dir if reports_dir
                                              else join(data_dir, 'reports'))
    loginfo('Complete.')
def main():
    """Command-line driver: extract NLP features for game reviews and store
    them in the MongoDB database.

    Parses command-line options, sets up DEBUG-level logging to both the
    console and a log file, connects to MongoDB, and then runs
    ``extract_nlp_features_into_db`` once per requested game file.

    NOTE(review): this appears to be a near-identical duplicate of an
    earlier ``main()`` definition in this file (same options, same logic,
    different formatting); if both live in the same module, the later
    definition silently shadows the earlier one — confirm and remove one.
    Relies on module-level names imported/defined elsewhere in this file
    (``ArgumentParser``, ``ArgumentDefaultsHelpFormatter``, ``project_dir``,
    ``join``, ``realpath``, ``dirname``, ``splitext``).
    """
    parser = ArgumentParser(
        usage='python extract_features.py --game_files '
              'GAME_FILE1,GAME_FILE2,...[ OPTIONS]',
        description='Extract features and add them to the Mongo database.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    # Bind the bound method once to shorten the many add_argument calls below
    parser_add_argument = parser.add_argument
    parser_add_argument(
        '--game_files',
        help='Comma-separated list of file-names or "all" for all of the '
             'files (the game files should reside in the "data" directory; '
             'the .jsonlines suffix is not necessary, but the file-names '
             'should be exact matches otherwise).',
        type=str,
        required=True)
    parser_add_argument(
        '--do_not_binarize_features',
        help='Do not make all non-zero feature frequencies equal to 1.',
        action='store_true',
        default=False)
    parser_add_argument(
        '--do_not_lowercase_text',
        help='Do not make lower-casing part of the review text '
             'normalization step, which affects word n-gram-related '
             'features.',
        action='store_true',
        default=False)
    parser_add_argument(
        '--lowercase_cngrams',
        help='Lower-case the review text before extracting character n-gram '
             'features.',
        action='store_true',
        default=False)
    parser_add_argument(
        '--partition',
        help='Data partition, i.e., "training", "test", etc. Value must be a '
             'valid partition set name in the Mongo database. Alternatively, '
             'the value "all" can be used to include all partitions.',
        type=str,
        default='all')
    parser_add_argument(
        '--do_not_reuse_extracted_features',
        help="Don't make use of previously-extracted features present in the"
             " Mongo database and instead replace them if they are.",
        action='store_true',
        default=False)
    parser_add_argument('-dbhost', '--mongodb_host',
        help='Host that the MongoDB server is running on.',
        type=str,
        default='localhost')
    parser_add_argument('-dbport', '--mongodb_port',
        help='Port that the MongoDB server is running on.',
        type=int,
        default=27017)
    parser_add_argument('-log', '--log_file_path',
        help='Path to feature extraction log file.',
        type=str,
        default=join(project_dir, 'logs', 'replog_extract_features.txt'))
    args = parser.parse_args()

    # Imports
    # Deferred until after argument parsing so that "--help" does not pay
    # the cost of (or fail on) the project-local imports below
    import logging
    from util.mongodb import connect_to_db
    from util.datasets import get_game_files
    from src.features import extract_nlp_features_into_db

    # Make local copies of arguments
    # The "do_not_*" flags are inverted here so the rest of the code can
    # work with positive booleans
    game_files = args.game_files
    binarize = not args.do_not_binarize_features
    reuse_features = not args.do_not_reuse_extracted_features
    lowercase_text = not args.do_not_lowercase_text
    lowercase_cngrams = args.lowercase_cngrams
    partition = args.partition
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Setup logger and create logging handlers (DEBUG level on both the
    # stream handler and the file handler)
    logger = logging.getLogger('extract_features')
    logging_debug = logging.DEBUG
    logger.setLevel(logging_debug)
    loginfo = logger.info
    logdebug = logger.debug
    logerr = logger.error  # NOTE(review): bound but unused in this function
    logwarn = logger.warning  # NOTE(review): bound but unused in this function
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -'
                                  ' %(message)s')
    sh = logging.StreamHandler()
    sh.setLevel(logging_debug)
    fh = logging.FileHandler(realpath(args.log_file_path))
    fh.setLevel(logging_debug)
    sh.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(sh)
    logger.addHandler(fh)

    # Print out some logging information about the upcoming tasks
    logdebug('Project directory: {}'.format(project_dir))
    logdebug('Binarize features? {}'.format(binarize))
    logdebug('Try to reuse previously-extracted features in the database? {}'.
             format(reuse_features))
    logdebug('Lower-case text as part of the normalization step? {}'.format(
        lowercase_text))
    logdebug(
        'Lower-case character n-grams during feature extraction? {}'.format(
            lowercase_cngrams))

    # Establish connection to MongoDB database collection
    loginfo('Connecting to MongoDB database on mongodb://{}:{}...'.format(
        mongodb_host, mongodb_port))
    reviewdb = connect_to_db(host=mongodb_host, port=mongodb_port)
    # w=0 disables write acknowledgement (fire-and-forget writes) —
    # presumably a deliberate speed/safety trade-off; verify it is intended
    reviewdb.write_concern['w'] = 0

    # Get list of games (resolves the "all" value / bare file-names against
    # the project "data" directory)
    game_files = get_game_files(game_files,
                                join(dirname(dirname(__file__)), 'data'))

    # Iterate over the game files, extracting and adding/replacing
    # features to the database
    for game_file in game_files:
        game = splitext(game_file)[0]
        if partition == 'all':
            partition_string = (' from the "training" and "test" data '
                                'partitions')
        else:
            partition_string = (
                ' from the "{}" data partition'.format(partition))
        loginfo('Extracting features{} for {}...'.format(
            partition_string, game))
        extract_nlp_features_into_db(reviewdb,
                                     partition,
                                     game,
                                     reuse_nlp_feats=reuse_features,
                                     use_binarized_nlp_feats=binarize,
                                     lowercase_text=lowercase_text,
                                     lowercase_cngrams=lowercase_cngrams)
def main():
    """Command-line driver: build .arff files from game review data.

    Parses command-line options, sets up INFO-level logging to console and
    file, validates the flag combinations around hours-played binning and
    MongoDB usage, and then writes ARFF files either for all game files
    combined (``--mode combined``) or one per game file
    (``--mode separate``), via ``write_arff_file``.

    Fixes applied in this revision (behavior-affecting):
      * combined-mode max-hours update read ``dataset['max']`` instead of
        ``dataset['maxh']`` (KeyError/incorrect comparison);
      * the --bin_factor conflict error message had its logic inverted
        ("are being binned" -> "are not being binned");
      * the "arff files directory" log line printed the parent output_dir
        rather than the directory the files are actually written to;
      * the fatal --nbins-missing branch logged via loginfo before exiting;
        it now uses logerror like every other fatal branch.

    NOTE(review): relies on module-level names imported/defined elsewhere in
    this file (``ArgumentParser``, ``ArgumentDefaultsHelpFormatter``,
    ``project_dir``, ``join``, ``abspath``, ``exists``, ``isdir``,
    ``dirname``, ``splitext``).
    """
    parser = ArgumentParser(
        usage='python make_arff_files.py --game_files GAME_FILE1,GAME_FILE2[ '
              'OPTIONS]',
        description='Build .arff files for a specific game file, all game '
                    'files combined, or for each game file separately.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    # Bind the bound method once to shorten the many add_argument calls below
    parser_add_argument = parser.add_argument
    parser_add_argument('--game_files',
        help='Comma-separated list of file-names or "all" for all of the '
             'files (the game files should reside in the "data" directory; '
             'the .jsonlines suffix is not necessary, but the file-names '
             'should be exact matches otherwise).',
        type=str,
        required=True)
    parser_add_argument('--output_dir', '-o',
        help='Destination directory for ARFF files.',
        type=str,
        required=True)
    parser_add_argument('--mode',
        help='Make .arff file for each game file separately ("separate") or '
             'for all game files combined ("combined").',
        choices=["separate", "combined"],
        default="combined")
    parser_add_argument('--combined_file_prefix',
        help='If the "combined" value was passed in via the --mode flag '
             '(which happens by default unless specified otherwise), an '
             'output file prefix must be passed in via this option flag.',
        type=str,
        required=False)
    parser_add_argument('--use_original_hours_values',
        help='Use the unmodified hours played values; otherwise, use the '
             'collapsed values.',
        action='store_true',
        default=False)
    parser_add_argument('--use_mongodb',
        help='Search the MongoDB collection for training/test set reviews and'
             ' make ARFF files using them only (the file suffix ".train"/'
             '".test" will be appended onto the end of the output file name '
             'to distinguish the different files); note that, by default, '
             'collapsed hours played values will be used (if this is not '
             'desired, use the --use_original_hours_values flag).',
        action='store_true',
        default=False)
    parser_add_argument('--nbins',
        help='Specify the number of bins in which to collapse hours played '
             'values; to be used if the --make_train_test_sets flag is not '
             'being used, in which case pre-computed hours played values will'
             ' not be read in from the database, but you still want the '
             'values to be in the form of bins (i.e., 1 for 0-100, 2 for '
             '101-200, etc., depending on the minimum and maximum values and '
             'the number of bins specified).',
        type=int,
        required=False)
    parser_add_argument('--bin_factor',
        help='Factor by which to multiply the sizes of the bins, such that '
             'the bins with lots of values will be smaller and the more '
             'sparsely-populated bins will be smaller in terms of range.',
        type=float,
        default=1.0)
    parser_add_argument('-dbhost', '--mongodb_host',
        help='Host that the MongoDB server is running on.',
        type=str,
        default='localhost')
    parser_add_argument('--mongodb_port', '-dbport',
        help='Port that the MongoDB server is running on.',
        type=int,
        default=27017)
    parser_add_argument('--log_file_path', '-log',
        help='Path for log file.',
        type=str,
        default=join(project_dir, 'logs', 'replog_make_arff.txt'))
    args = parser.parse_args()

    # Imports
    # Deferred until after argument parsing so that "--help" does not pay
    # the cost of (or fail on) the project-local imports below
    import os
    import logging
    from re import sub
    from sys import exit
    from util.mongodb import connect_to_db
    from util.datasets import (get_game_files,
                               get_bin_ranges,
                               write_arff_file,
                               get_and_describe_dataset)

    # Make local copies of arguments
    game_files = args.game_files
    output_dir = args.output_dir
    mode = args.mode
    combined_file_prefix = args.combined_file_prefix
    use_mongodb = args.use_mongodb
    nbins = args.nbins
    # bins is a boolean here: True = use collapsed hours values
    bins = not args.use_original_hours_values
    bin_factor = args.bin_factor
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Initialize logging system (INFO level on both handlers)
    logging_info = logging.INFO
    logger = logging.getLogger('make_arff_files')
    logger.setLevel(logging_info)

    # Create file handler
    fh = logging.FileHandler(abspath(args.log_file_path))
    fh.setLevel(logging_info)

    # Create console handler
    sh = logging.StreamHandler()
    sh.setLevel(logging_info)

    # Add nicer formatting
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -'
                                  ' %(message)s')
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(sh)
    loginfo = logger.info
    logerror = logger.error
    logwarn = logger.warning

    # Check if the output directory exists
    output_dir = abspath(output_dir)
    if not (exists(output_dir) and isdir(output_dir)):
        logerror('The given output directory, {}, for ARFF files does not '
                 'exist or is not a directory. Exiting.'.format(output_dir))
        exit(1)

    # Make sure --bins option flag makes sense
    if nbins:
        if use_mongodb:
            logerror('If the --use_mongodb flag is used, a number of bins in '
                     'which to collapse the hours played values cannot be '
                     'specified (since the values in the database were '
                     'pre-computed). Exiting.')
            exit(1)
        elif not bins:
            logerror('Conflict between the --use_original_hours_values and '
                     '--nbins flags. Both cannot be used at the same time.')
            exit(1)
    elif (bins and not use_mongodb):
        # FIX: this fatal branch previously logged via loginfo; use
        # logerror for consistency with the other exits
        logerror('If both the --use_original_hours_values and --use_mongodb '
                 'flags are not used, then the number of bins in which to '
                 'collapse the hours played values must be specified via the '
                 '--nbins option argument. Exiting.')
        exit(1)

    # Exit if the --bin_factor argument was used despite the fact that
    # the original hours values are not being binned
    if (not bins and bin_factor > 1.0):
        # FIX: the message's logic was inverted — this branch fires when
        # the values are NOT being binned
        logerror('The --bin_factor argument was specified despite the fact '
                 'that the original hours values are not being binned. '
                 'Exiting.')
        exit(1)

    # Get path to the data directory
    data_dir = join(project_dir, 'data')
    if bins:
        arff_files_dir = join(output_dir, 'arff_files_collapsed_values')
    else:
        arff_files_dir = join(output_dir, 'arff_files_original_values')
    loginfo('data directory: {}'.format(data_dir))
    # FIX: previously logged output_dir under the "arff files directory"
    # label; log the directory the files are actually written to
    loginfo('arff files directory: {}'.format(arff_files_dir))

    # Make sure there is a combined output file prefix if "combine" is
    # the value passed in via --mode
    if (mode == 'combined' and not combined_file_prefix):
        logerror('A combined output file prefix must be specified in cases '
                 'where the "combined" value was passed in via the --mode '
                 'option flag (or --mode was not specified, in which case '
                 '"combined" is the default value). Exiting.')
        exit(1)

    '''
    See if the --use_mongodb flag was used, in which case we have to
    make a connection to the MongoDB collection. And, if it wasn't
    used, then print out warning if the --mongodb_port flag was used
    (since it will be ignored) unless the value is equal to the
    default value (since it probably wasn't specified in that case).
    '''
    if use_mongodb:
        loginfo('Connecting to MongoDB database on mongodb://{}:{}...'
                .format(mongodb_host, mongodb_port))
        reviewdb = connect_to_db(host=mongodb_host, port=mongodb_port)
    elif (mongodb_port and not mongodb_port == 27017):
        logwarn('Ignoring argument passed in via the --mongodb_port/-dbport '
                'option flag since the --use_mongodb flag was not also used, '
                'which means that the MongoDB database is not going to be '
                'used.')

    # Resolve the "all" value / bare file-names against the project "data"
    # directory
    game_files = get_game_files(game_files,
                                join(dirname(dirname(__file__)), 'data'))
    if len(game_files) == 1:
        # Print out warning message if --mode was set to "combined" and
        # there was only one file n the list of game files since only a
        # single ARFF file will be created
        if mode == 'combined':
            logwarn('The --mode flag was used with the value "combined" (or '
                    'was unspecified) even though only one game file was '
                    'passed in via the --game_files flag. Only one file will '
                    'be written and it will be named after the game.')
            mode = "separate"

    # Make a list of dicts corresponding to each review and write .arff
    # files
    loginfo('Reading in data from reviews files...')
    if mode == "combined":
        review_dicts_list = []
        if not use_mongodb:
            # Min/max values of hours played (i.e., game experience)
            # NOTE(review): minh starts at 0.0 and is only updated when a
            # dataset minimum falls BELOW it, so for non-negative hours
            # values minh always stays 0.0 — confirm this is intended
            if bins:
                minh = 0.0
                maxh = 0.0
            for game_file in game_files:
                loginfo('Getting review data from {}...'.format(game_file))
                dataset = get_and_describe_dataset(join(data_dir, game_file),
                                                   report=False)
                review_dicts_list.extend(dataset['reviews'])
                # If the hours played values are to be divided into
                # bins, update the min/max values
                if bins:
                    if dataset['minh'] < minh:
                        minh = dataset['minh']
                    # FIX: previously compared dataset['max'] (a key typo)
                    # against maxh; the dataset dict's key is 'maxh'
                    if dataset['maxh'] > maxh:
                        maxh = dataset['maxh']
            # If the hours played values are to be divided into bins,
            # get the range that each bin maps to
            if bins:
                bin_ranges = get_bin_ranges(minh, maxh, nbins, bin_factor)
            else:
                bin_ranges = False
        file_names = [splitext(game)[0] for game in game_files]
        arff_file = join(arff_files_dir,
                         '{}.arff'.format(combined_file_prefix))
        if use_mongodb:
            loginfo('Generating ARFF files for the combined training sets and'
                    ' the combined test sets, respectively, of the following '
                    'games:\n\n{}'.format(', '.join([sub(r'_', r' ', fname)
                                                     for fname
                                                     in file_names])))
            write_arff_file(arff_file,
                            file_names,
                            reviewdb=reviewdb,
                            make_train_test=True,
                            bins=True)
        else:
            loginfo('Generating {}...'.format(arff_file))
            write_arff_file(arff_file,
                            file_names,
                            reviews=review_dicts_list,
                            bins=bin_ranges)
    else:
        for game_file in game_files:
            loginfo('Getting review data from {}...'.format(game_file))
            if not use_mongodb:
                review_dicts_list = []
                dataset = get_and_describe_dataset(join(data_dir, game_file),
                                                   report=False)
                review_dicts_list.extend(dataset['reviews'])
                if bins:
                    # Get min/max hours played values from results of
                    # get_and_describe_dataset() call
                    minh = dataset['minh']
                    maxh = dataset['maxh']
                    # Get the range that each bin maps to
                    bin_ranges = get_bin_ranges(minh, maxh, nbins, bin_factor)
                else:
                    bin_ranges = False
            game = splitext(game_file)[0]
            arff_file = join(arff_files_dir, '{}.arff'.format(game))
            if use_mongodb:
                loginfo('Generating ARFF file for the training and test sets '
                        'for {}...'.format(game))
                # NOTE(review): the combined branch passes bins=True here
                # while this branch passes the bins boolean — confirm which
                # is intended for write_arff_file
                write_arff_file(arff_file,
                                [game],
                                reviewdb=reviewdb,
                                make_train_test=True,
                                bins=bins)
            else:
                loginfo('Generating {}...'.format(arff_file))
                write_arff_file(arff_file,
                                [game],
                                reviews=review_dicts_list,
                                bins=bin_ranges)
    loginfo('Complete.')