Example #1
# Imports needed to make this test-fixture method self-contained
# (connect_to_db comes from this project's src.mongodb module)
from pymongo.errors import AutoReconnect, ConnectionFailure

from src.mongodb import connect_to_db


def setUp(self):
    try:
        self.db = connect_to_db('localhost', 37017)
    except AutoReconnect as e:
        raise ConnectionFailure('Could not connect to MongoDB client. Make'
                                ' sure a tunnel is set up (or some other '
                                'method is used) before running the '
                                'tests.') from e
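None of these examples show `connect_to_db` itself. From its call sites it returns a pymongo collection object (later examples set `db.write_concern['w']` and call `db.create_index`), and example #6's usage text names the "reviews_project" database's "reviews" collection, so a minimal sketch might look like the following; the connection-probing detail is an assumption, not the project's actual implementation. The tests dial port 37017 rather than MongoDB's default 27017, which is why the error message above asks for a tunnel (e.g., `ssh -NL 37017:dbhost:27017 user@gateway` would forward the server's default port to local port 37017).

# Hypothetical sketch of connect_to_db, inferred from the call sites in
# these examples; the real src.mongodb implementation may differ.
from pymongo import MongoClient


def connect_to_db(host='localhost', port=27017):
    """Return the 'reviews' collection of the 'reviews_project' database."""
    client = MongoClient(host=host, port=port)
    # Force a round trip so that an unreachable server surfaces here (as
    # AutoReconnect/ConnectionFailure, depending on the pymongo version)
    # rather than on the first real operation.
    client.server_info()
    return client['reviews_project']['reviews']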
Example #2
# Imports needed to make this test-fixture method self-contained
# (connect_to_db comes from src.mongodb; this_dir is assumed to be
# defined at the module level as the test directory's path)
from os import makedirs
from os.path import exists, join
from shutil import rmtree

from pymongo.errors import AutoReconnect, ConnectionFailure

from src.mongodb import connect_to_db


def setUp(self):
    try:
        self.db = connect_to_db('localhost', 37017)
    except AutoReconnect as e:
        raise ConnectionFailure('Could not connect to MongoDB client. Make'
                                ' sure a tunnel is set up (or some other '
                                'method is used) before running the '
                                'tests.') from e
    self.prediction_label = 'total_game_hours'
    self.output_path = join(this_dir, 'test_output')
    if exists(self.output_path):
        rmtree(self.output_path)
    makedirs(self.output_path)
Example #3
# Module-level imports assumed by this script (logger is assumed to be a
# logging.Logger configured elsewhere in the module)
from argparse import (ArgumentParser,
                      ArgumentDefaultsHelpFormatter)


def main(argv=None):
    parser = ArgumentParser(description='Run incremental learning '
                                        'experiments.',
                            formatter_class=ArgumentDefaultsHelpFormatter,
                            conflict_handler='resolve')
    _add_arg = parser.add_argument
    _add_arg('-dbhost', '--mongodb_host',
             help='Host that the MongoDB server is running on.',
             type=str,
             default='localhost')
    _add_arg('--mongodb_port', '-dbport',
             help='Port that the MongoDB server is running on.',
             type=int,
             default=37017)
    # Parse the passed-in argument list (None falls back to sys.argv)
    args = parser.parse_args(args=argv)

    # Imports (deferred until after argument parsing)
    from pymongo import ASCENDING
    from pymongo.errors import ConnectionFailure

    from src.mongodb import connect_to_db

    # Connect to MongoDB database
    logger.info('Connecting to MongoDB database at {0}:{1}...'
                .format(args.mongodb_host, args.mongodb_port))
    try:
        db = connect_to_db(args.mongodb_host, args.mongodb_port)
    except ConnectionFailure as e:
        logger.error('Failed to connect to the MongoDB database collection.')
        raise e

    # Create index on 'steam_id_number' so that cursors can be sorted
    # on that particular key
    logger.info('Creating index on the "steam_id_number" key.')
    db.create_index([('steam_id_number', ASCENDING)])
    logger.info('Created new index named "steam_id_number_1" in the "reviews" '
                'collection.')
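As a side note, pymongo's `create_index` expects either a single key name or a list of `(key, direction)` pairs, which is why the call above wraps the key in a list. A standalone check, assuming a reachable server on the default port and the database/collection names used elsewhere in these examples, would be:

# Standalone sketch: verify the index name that pymongo generates for a
# single-key ascending index (assumes a server at localhost:27017).
from pymongo import ASCENDING, MongoClient

collection = MongoClient('localhost', 27017)['reviews_project']['reviews']
collection.create_index([('steam_id_number', ASCENDING)])
# pymongo names single-key ascending indexes '<key>_1'
assert 'steam_id_number_1' in collection.index_information()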
Example #4
# Module-level imports and names assumed by this script: logger, formatter,
# the logdebug/loginfo/logerr logging helpers, logging_debug (an alias for
# logging.DEBUG), log_dir, and project_dir
import logging
from argparse import (ArgumentParser,
                      ArgumentDefaultsHelpFormatter)
from os import makedirs
from os.path import (dirname, exists, join, realpath, splitext)


def main():
    parser = ArgumentParser(
        usage='python extract_features.py --game_files '
        'GAME_FILE1,GAME_FILE2,...[ OPTIONS]',
        description='Extract features and add them to the Mongo database.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    _add_arg = parser.add_argument
    _add_arg('--game_files',
             help='Comma-separated list of file-names or "all" for all of the'
             ' files (the game files should reside in the "data" '
             'directory; the .jsonlines suffix is not necessary, but the'
             ' file-names should be exact matches otherwise).',
             type=str,
             required=True)
    _add_arg('--do_not_binarize_features',
             help='Do not make all non-zero feature frequencies equal to 1.',
             action='store_true',
             default=False)
    _add_arg('--do_not_lowercase_text',
             help='Do not make lower-casing part of the review text '
             'normalization step, which affects word n-gram-related '
             'features.',
             action='store_true',
             default=False)
    _add_arg('--lowercase_cngrams',
             help='Lower-case the review text before extracting character '
             'n-gram features.',
             action='store_true',
             default=False)
    _add_arg('--partition',
             help='Data partition, i.e., "training", "test", etc. Value must '
             'be a valid partition set name in the Mongo database. '
             'Alternatively, the value "all" can be used to include all '
             'partitions.',
             type=str,
             default='all')
    _add_arg('--do_not_reuse_extracted_features',
             help="Don't make use of previously-extracted features present in"
             " the Mongo database and instead replace them if they are.",
             action='store_true',
             default=False)
    _add_arg('-dbhost',
             '--mongodb_host',
             help='Host that the MongoDB server is running on.',
             type=str,
             default='localhost')
    _add_arg('-dbport',
             '--mongodb_port',
             help='Port that the MongoDB server is running on.',
             type=int,
             default=27017)
    _add_arg('--update_batch_size',
             '-batch_size',
             help='Size of each batch for the bulk updates.',
             type=int,
             default=100)
    _add_arg('-log',
             '--log_file_path',
             help='Path to feature extraction log file.',
             type=str,
             default=join(log_dir, 'replog_extract_features.txt'))
    args = parser.parse_args()

    # Imports
    from pymongo.errors import (BulkWriteError, ConnectionFailure)
    from src import (get_game_files, log_format_string)
    from src.mongodb import (connect_to_db,
                             bulk_extract_features_and_update_db)

    # Make local copies of arguments
    game_files = args.game_files
    binarize = not args.do_not_binarize_features
    reuse_features = not args.do_not_reuse_extracted_features
    lowercase_text = not args.do_not_lowercase_text
    lowercase_cngrams = args.lowercase_cngrams
    partition = args.partition
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port
    update_batch_size = args.update_batch_size
    if update_batch_size < 1:
        raise ValueError('--update_batch_size/-batch_size should be greater '
                         'than 0.')

    # Make sure log file directory exists
    log_file_path = realpath(args.log_file_path)
    log_file_dir = dirname(log_file_path)
    if not exists(log_file_dir):
        makedirs(log_file_dir, exist_ok=True)

    # Setup file handler
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging_debug)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Print out some logging information about the upcoming tasks
    logdebug('Project directory: {0}'.format(project_dir))
    logdebug('Binarize features? {0}'.format(binarize))
    logdebug('Try to reuse previously-extracted features in the database? {0}'.
             format(reuse_features))
    logdebug('Lower-case text as part of the normalization step? {0}'.format(
        lowercase_text))
    logdebug(
        'Lower-case character n-grams during feature extraction? {0}'.format(
            lowercase_cngrams))
    logdebug('Batch size for database updates: {0}'.format(update_batch_size))

    # Establish connection to MongoDB database collection
    loginfo('Connecting to MongoDB database on mongodb://{0}:{1}...'.format(
        mongodb_host, mongodb_port))
    try:
        db = connect_to_db(host=mongodb_host, port=mongodb_port)
    except ConnectionFailure as e:
        logerr('Unable to connect to MongoDB reviews collection.')
        logerr(e)
        raise e
    db.write_concern['w'] = 0

    # Get list of games
    game_files = get_game_files(game_files)

    # Iterate over the game files, extracting and adding/replacing
    # features to the database
    for game_file in game_files:
        game = splitext(game_file)[0]
        if partition == 'all':
            partition_string = ' from the "training" and "test" data partitions'
        else:
            partition_string = ' from the "{0}" data partition'.format(
                partition)
        loginfo('Extracting features{0} for {1}...'.format(
            partition_string, game))
        try:
            updates = bulk_extract_features_and_update_db(
                db,
                game,
                partition,
                reuse_nlp_feats=reuse_features,
                use_binarized_nlp_feats=binarize,
                lowercase_text=lowercase_text,
                lowercase_cngrams=lowercase_cngrams,
                update_batch_size=update_batch_size)
        except BulkWriteError as bwe:
            logerr('Encountered a BulkWriteError while executing the call to '
                   '`bulk_extract_features_and_update_db`.')
            raise bwe
    if updates:
        loginfo(
            '{0} updates were made to the reviews collection.'.format(updates))
    else:
        raise ValueError('No updates were made.')
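`bulk_extract_features_and_update_db` is not shown in any of these examples, but the `--update_batch_size` flag and the `BulkWriteError` handling imply that it flushes its updates to MongoDB in fixed-size batches. A minimal sketch of that batching pattern, with hypothetical document fields and pymongo's bulk-write API standing in for the project's actual update logic, might be:

# Hypothetical sketch of the database-update half of
# bulk_extract_features_and_update_db; the field names ('_id', 'features')
# are assumptions, not the project's actual schema.
from pymongo import UpdateOne


def bulk_update_features(collection, id_feature_pairs, update_batch_size=100):
    """Apply feature updates in batches; return the number modified."""
    updates = 0
    batch = []
    for _id, features in id_feature_pairs:
        batch.append(UpdateOne({'_id': _id},
                               {'$set': {'features': features}}))
        if len(batch) == update_batch_size:
            # bulk_write raises BulkWriteError on failure, matching the
            # exception handled in main() above
            updates += collection.bulk_write(batch).modified_count
            batch = []
    if batch:
        # Flush the final, possibly partial, batch
        updates += collection.bulk_write(batch).modified_count
    return updates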
Example #5
# Module-level imports and names assumed by this script: logger, formatter,
# the loginfo/logwarn/logerr logging helpers, logging_info (an alias for
# logging.INFO), log_dir, and data_dir
import logging
from argparse import (ArgumentParser,
                      ArgumentDefaultsHelpFormatter)
from os import makedirs
from os.path import (dirname, exists, isdir, join, realpath, splitext)


def main():
    parser = ArgumentParser(usage='python make_arff_files.py --game_files '
                                  'GAME_FILE1,GAME_FILE2[ OPTIONS]',
                            description='Build .arff files for a specific '
                                        'game file, all game files combined, '
                                        'or for each game file separately.',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    _add_arg = parser.add_argument
    _add_arg('--game_files',
             help='Comma-separated list of file-names or "all" for all of the'
                  ' files (the game files should reside in the "data" '
                  'directory; the .jsonlines suffix is not necessary, but the'
                  ' file-names should be exact matches otherwise).',
             type=str,
             required=True)
    _add_arg('--output_dir', '-o',
             help='Destination directory for ARFF files.',
             type=str,
             required=True)
    _add_arg('--mode',
             help='Make .arff file for each game file separately ("separate")'
                  ' or for all game files combined ("combined").',
             choices=["separate", "combined"],
             default="combined")
    _add_arg('--combined_file_prefix',
             help='If the "combined" value was passed in via the --mode flag '
                  '(which happens by default unless specified otherwise), an '
                  'output file prefix must be passed in via this option '
                  'flag.',
             type=str,
             required=False)
    _add_arg('--use_original_hours_values',
             help='Use the unmodified hours played values; otherwise, use the'
                  ' collapsed values.',
             action='store_true',
             default=False)
    _add_arg('--use_mongodb',
             help='Search the MongoDB collection for training/test set '
                  'reviews and make ARFF files using them only (the file '
                  'suffix ".train"/".test" will be appended onto the end of '
                  'the output file name to distinguish the different files); '
                  'note that, by default, collapsed hours played values will '
                  'be used (if this is not desired, use the '
                  '--use_original_hours_values flag).',
             action='store_true',
             default=False)
    _add_arg('--nbins',
             help='Specify the number of bins in which to collapse hours '
                  'played values; to be used if the --use_mongodb flag is '
                  'not being used, in which case pre-computed hours '
                  'played values will not be read in from the database, but '
                  'you still want the values to be in the form of bins (i.e.,'
                  ' 1 for 0-100, 2 for 101-200, etc., depending on the '
                  'minimum and maximum values and the number of bins '
                  'specified).',
             type=int,
             required=False)
    _add_arg('--bin_factor',
             help='Factor by which to multiply the sizes of the bins, such '
                  'that the bins with lots of values will be smaller in '
                  'terms of range and the more sparsely-populated bins will '
                  'be larger.',
             type=float,
             default=1.0)
    _add_arg('-dbhost', '--mongodb_host',
             help='Host that the MongoDB server is running on.',
             type=str,
             default='localhost')
    _add_arg('--mongodb_port', '-dbport',
             help='Port that the MongoDB server is running on.',
             type=int,
             default=27017)
    _add_arg('--log_file_path', '-log',
             help='Path for log file.',
             type=str,
             default=join(log_dir, 'replog_make_arff.txt'))
    args = parser.parse_args()

    # Imports (deferred until after argument parsing)
    from re import sub
    from src.mongodb import (connect_to_db,
                             get_game_files)
    from src.datasets import (get_bin_ranges,
                              write_arff_file,
                              get_and_describe_dataset)

    # Make local copies of arguments
    game_files = args.game_files
    output_dir = args.output_dir
    mode = args.mode
    combined_file_prefix = args.combined_file_prefix
    use_mongodb = args.use_mongodb
    nbins = args.nbins
    bins = not args.use_original_hours_values
    bin_factor = args.bin_factor
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Make sure log file directory exists
    log_file_path = realpath(args.log_file_path)
    log_file_dir = dirname(log_file_path)
    if not exists(log_file_dir):
        makedirs(log_file_dir, exist_ok=True)

    # Make file handler
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging_info)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Check if the output directory exists
    output_dir = realpath(output_dir)
    if not exists(output_dir) or not isdir(output_dir):
        msg = ('The given output directory, {0}, for ARFF files does not '
               'exist or is not a directory.'.format(output_dir))
        logerr(msg)
        raise ValueError(msg)

    # Make sure --bins option flag makes sense
    if nbins:
        if use_mongodb:
            msg = ('If the --use_mongodb flag is used, a number of bins in '
                   'which to collapse the hours played values cannot be '
                   'specified (since the values in the database were '
                   'pre-computed).')
            logerr(msg)
            raise ValueError(msg)
        elif not bins:
            msg = ('Conflict between the --use_original_hours_values and '
                   '--nbins flags. Both cannot be used at the same time.')
            logerr(msg)
            raise ValueError(msg)
    elif bins and not use_mongodb:
        msg = ('If both the --use_original_hours_values and --use_mongodb '
               'flags are not used, then the number of bins in which to '
               'collapse the hours played values must be specified via the '
               '--nbins option argument.')
        logerr(msg)
        raise ValueError(msg)

    # Exit if the --bin_factor argument was used despite the fact that
    # the original hours values are not being binned
    if not bins and bin_factor > 1.0:
        msg = ('The --bin_factor argument was specified despite the fact '
               'that the original hours values are not being binned.')
        logerr(msg)
        raise ValueError(msg)

    # Get path to the ARFF files directory and make sure that it exists
    if bins:
        arff_files_dir = join(output_dir, 'arff_files_collapsed_values')
    else:
        arff_files_dir = join(output_dir, 'arff_files_original_values')
    makedirs(arff_files_dir, exist_ok=True)
    loginfo('data directory: {0}'.format(data_dir))
    loginfo('arff files directory: {0}'.format(arff_files_dir))

    # Make sure there is a combined output file prefix if "combined" is
    # the value passed in via --mode
    if mode == 'combined' and not combined_file_prefix:
        msg = ('A combined output file prefix must be specified in cases '
               'where the "combined" value was passed in via the --mode '
               'option flag (or --mode was not specified, in which case '
               '"combined" is the default value).')
        logerr(msg)
        raise ValueError(msg)

    """
    See if the --use_mongodb flag was used, in which case we have to
    make a connection to the MongoDB collection. And, if it wasn't
    used, then print out warning if the --mongodb_port flag was used
    (since it will be ignored) unless the value is equal to the default
    value (since it probably wasn't specified in that case).
    """
    if use_mongodb:
        loginfo('Connecting to MongoDB database on mongodb://{0}:{1}...'
                .format(mongodb_host, mongodb_port))
        reviewdb = connect_to_db(host=mongodb_host, port=mongodb_port)
    elif mongodb_port and mongodb_port != 27017:
        logwarn('Ignoring argument passed in via the --mongodb_port/-dbport '
                'option flag since the --use_mongodb flag was not also used, '
                'which means that the MongoDB database is not going to be '
                'used.')

    game_files = get_game_files(game_files)
    if len(game_files) == 1:

        # Print out warning message if --mode was set to "combined" and
        # there was only one file in the list of game files since only a
        # single ARFF file will be created
        if mode == 'combined':
            logwarn('The --mode flag was used with the value "combined" (or '
                    'was unspecified) even though only one game file was '
                    'passed in via the --game_files flag. Only one file will '
                    'be written and it will be named after the game.')
        mode = "separate"

    # Make a list of dicts corresponding to each review and write .arff
    # files
    loginfo('Reading in data from reviews files...')
    if mode == "combined":
        review_dicts_list = []
        if not use_mongodb:
            for game_file in game_files:
                loginfo('Getting review data from {0}...'.format(game_file))
                review_dicts_list.extend(
                    get_and_describe_dataset(join(data_dir, game_file),
                                             report=False))

            # If the hours played values are to be divided into bins,
            # get the range that each bin maps to
            if bins:
                bin_ranges = get_bin_ranges(min([r['total_game_hours'] for r
                                                 in review_dicts_list]),
                                            max([r['total_game_hours'] for r
                                                 in review_dicts_list]),
                                            nbins,
                                            bin_factor)
            else:
                bin_ranges = False
        file_names = [splitext(game)[0] for game in game_files]
        arff_file = join(arff_files_dir,
                         '{0}.arff'.format(combined_file_prefix))
        if use_mongodb:
            loginfo('Generating ARFF files for the combined training sets and'
                    ' the combined test sets, respectively, of the following '
                    'games:\n\n{0}'
                    .format(', '.join([sub(r'_', r' ', fname)
                                       for fname in file_names])))
            write_arff_file(arff_file, file_names, reviewdb=reviewdb,
                            make_train_test=True, bins=True)
        else:
            loginfo('Generating {0}...'.format(arff_file))
            write_arff_file(arff_file,
                            file_names,
                            reviews=review_dicts_list,
                            bins=bin_ranges)
    else:
        for game_file in game_files:
            loginfo('Getting review data from {0}...'.format(game_file))
            if not use_mongodb:
                review_dicts_list = get_and_describe_dataset(join(data_dir,
                                                                  game_file),
                                                             report=False)
                if bins:
                    bin_ranges = get_bin_ranges(min([r['total_game_hours'] for r
                                                     in review_dicts_list]),
                                                max([r['total_game_hours'] for r
                                                     in review_dicts_list]),
                                                nbins,
                                                bin_factor)
                else:
                    bin_ranges = False
            game = splitext(game_file)[0]
            arff_file = join(arff_files_dir, '{0}.arff'.format(game))
            if use_mongodb:
                loginfo('Generating ARFF file for the training and test sets '
                        'for {0}...'.format(game))
                write_arff_file(arff_file,
                                [game],
                                reviewdb=reviewdb,
                                make_train_test=True,
                                bins=bins)
            else:
                loginfo('Generating {0}...'.format(arff_file))
                write_arff_file(arff_file,
                                [game],
                                reviews=review_dicts_list,
                                bins=bin_ranges)
    loginfo('Complete.')
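`write_arff_file` is also not shown in these examples. The .arff files it produces are in Weka's plain-text ARFF format, whose structure is a `@relation` line, one `@attribute` declaration per column, and a `@data` section; a minimal sketch with a hypothetical two-attribute schema (not the project's actual feature set) is:

# Minimal ARFF writer sketch; the relation name and two-attribute schema
# are illustrative only, not what the project's write_arff_file emits.
def write_minimal_arff(path, relation, rows):
    """Write (review_text, hours) pairs to a Weka ARFF file."""
    with open(path, 'w') as f:
        f.write('@relation {0}\n\n'.format(relation))
        f.write('@attribute review_text string\n')
        f.write('@attribute total_game_hours numeric\n\n')
        f.write('@data\n')
        for text, hours in rows:
            # String attribute values are quoted; embedded quotes escaped
            f.write('"{0}",{1}\n'.format(text.replace('"', '\\"'), hours))


write_minimal_arff('example.arff', 'reviews',
                   [('Great game', 102.5), ('Refunded it', 0.4)])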
Example #6
# Module-level imports and names assumed by this script: logger, formatter,
# the loginfo/logwarn/logerr logging helpers, logging_info (an alias for
# logging.INFO), log_dir, data_dir, and default_reports_dir
import logging
from argparse import (ArgumentParser,
                      ArgumentDefaultsHelpFormatter)
from os import makedirs
from os.path import (basename, dirname, exists, isfile, join, realpath,
                     splitext)


def main():
    parser = \
        ArgumentParser(usage='python make_train_test_sets.py --game_files '
                             'GAME_FILE1,GAME_FILE2,...[ OPTIONS]',
                       description='Build train/test sets for each game. Take'
                                   ' up to 21k reviews and split them 80/20 '
                                   'training/test, respectively, by default. '
                                   'Both the maximum size and the percentage '
                                   'split can be altered via command-line '
                                   'flags. All selected reviews will be put '
                                   'into the "reviews_project" database\'s '
                                   '"reviews" collection (which is being '
                                   'hosted on lemur.montclair.edu on port '
                                   '27017).',
                       formatter_class=ArgumentDefaultsHelpFormatter)
    _add_arg = parser.add_argument
    _add_arg('--game_files',
             help='Comma-separated list of file-names or "all" for all of the'
                  ' files (the game files should reside in the "data" '
                  'directory).',
             type=str,
             required=True)
    _add_arg('--max_size', '-m',
             help='Maximum number of reviews to get for training/testing (if '
                  'possible).',
             type=int,
             default=4000)
    _add_arg('--percent_train', '-%',
             help='Percent of selected reviews for which to use for the '
                  'training set, the rest going to the test set.',
             type=float,
             default=80.0)
    _add_arg('--convert_to_bins', '-bins',
             help='Number of sub-divisions of the hours-played values, e.g. '
                  'if 10 and the hours values range from 0 up to 1000, then '
                  'hours values 0-99 will become 1, 100-199 will become 2, '
                  'etc. (will probably be necessary to train a model that '
                  'actually is predictive to an acceptable degree); note that'
                  ' both hours values will be retained, the original under '
                  'the name "hours" and the converted value under the name '
                  '"hours_bin".',
             type=int,
             required=False)
    _add_arg('--bin_factor',
             help='If the --convert_to_bins/-bins argument is specified, '
                  'increase the sizes of the bins by the given factor so that'
                  ' bins in which there will be lots of instances will be '
                  'smaller in terms of range than bins that are more '
                  'sparsely-populated.',
             type=float,
             required=False)
    _add_arg('--make_reports', '-describe',
             help='Generate reports and histograms describing the data '
                  'filtering procedure.',
             action='store_true',
             default=False)
    _add_arg('--just_describe',
             help='Generate reports and histograms describing the data '
                  'filtering procedure, but then do NOT insert the reviews '
                  'into the MongoDB database.',
             action='store_true',
             default=False)
    _add_arg('--reports_dir',
             help='If -describe/--make_reports is used, put generated reports'
                  ' in the given directory.',
             type=str,
             required=False)
    _add_arg('-dbhost', '--mongodb_host',
             help='Host that the MongoDB server is running on.',
             type=str,
             default='localhost')
    _add_arg('--mongodb_port', '-dbport',
             help='Port that the MongoDB server is running on.',
             type=int,
             default=27017)
    _add_arg('--log_file_path', '-log',
             help='Path for log file.',
             type=str,
             default=join(log_dir, 'replog_make_train_test_sets.txt'))
    args = parser.parse_args()

    # Imports (deferred until after argument parsing)
    from pymongo.errors import ConnectionFailure
    from src import get_game_files
    from src.mongodb import (connect_to_db,
                             insert_train_test_reviews)

    # Make local copies of arguments
    game_files = args.game_files
    max_size = args.max_size
    percent_train = args.percent_train
    convert_to_bins = args.convert_to_bins
    bin_factor = args.bin_factor
    make_reports = args.make_reports
    just_describe = args.just_describe
    reports_dir = args.reports_dir
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Make sure log file directory exists
    log_file_path = realpath(args.log_file_path)
    log_file_dir = dirname(log_file_path)
    if not exists(log_file_dir):
        makedirs(log_file_dir, exist_ok=True)

    # Make file handler
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging_info)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Make sure value passed in via the --convert_to_bins/-bins option
    # flag makes sense and, if so, assign value to variable bins (if
    # not, set bins equal to 0)
    if convert_to_bins and convert_to_bins < 2:
        msg = ('The value passed in via --convert_to_bins/-bins must be '
               'greater than one since there must be multiple bins to '
               'divide the hours played values.')
        logerr(msg)
        raise ValueError(msg)
    elif convert_to_bins:
        bins = convert_to_bins
    else:
        bins = 0

    # Make sure that, if the --bin_factor argument is specified, the
    # --convert_to_bins/-bins argument was also specified
    if bin_factor and not convert_to_bins:
        msg = ('The --bin_factor argument was specified despite the fact '
               'that the --convert_to_bins/-bins argument was not used.')
        logerr(msg)
        raise ValueError(msg)

    # Establish connection to MongoDB database
    loginfo('Connecting to MongoDB database on mongodb://{0}:{1}...'
            .format(mongodb_host, mongodb_port))
    try:
        reviewdb = connect_to_db(host=mongodb_host, port=mongodb_port)
    except ConnectionFailure as e:
        logerr('Unable to connect to MongoDB reviews collection.')
        logerr(e)
        raise e
    reviewdb.write_concern['w'] = 0

    # Get path to the directories
    if reports_dir:
        reports_dir = realpath(reports_dir)
        if isfile(reports_dir):
            msg = ('The path passed in via the --reports_dir option flag '
                   'leads to a file, not a directory.')
            logerr(msg)
            raise ValueError(msg)
        if not exists(reports_dir):
            makedirs(reports_dir, exist_ok=True)

    # Make sure args make sense
    if max_size < 50:
        msg = ('You can\'t be serious, right? You passed in a value of less '
               'than 50 for the MAXIMUM size of the combination of '
               'training/test sets?')
        logerr(msg)
        raise ValueError(msg)
    if percent_train < 1.0:
        msg = ('You can\'t be serious, right? You passed in a value of less '
               'than 1.0% for the percentage of the selected reviews that '
               'will be devoted to the training set? That is not going to '
               'be enough training samples.')
        logerr(msg)
        raise ValueError(msg)

    # Make sense of arguments
    if make_reports and just_describe:
        logwarn('If the --just_describe and -describe/--make_reports option '
                'flags are used, --just_describe wins out, i.e., reports will'
                ' be generated, but no reviews will be inserted into the '
                'database.')
    elif reports_dir and (make_reports or just_describe):
        if not exists(reports_dir):
            logerr('The given --reports_dir path was invalid. Exiting.')
            exit(1)

    # Get list of games
    game_files = get_game_files(game_files,
                                join(dirname(dirname(__file__)), 'data'))

    loginfo('Adding training/test partitions to Mongo DB for the following '
            'games: {0}'
            .format(', '.join([splitext(game)[0] for game in game_files])))
    loginfo('Maximum size for the combined training/test sets: {0}'
            .format(max_size))
    loginfo('Percentage split between training and test sets: {0:.2f}/{1:.2f}'
            .format(percent_train, 100.0 - percent_train))
    if make_reports:
        loginfo('Generating reports in {0}.'
                .format(reports_dir if reports_dir else default_reports_dir))
    if just_describe:
        loginfo('Exiting after generating reports.')
    if bins:
        loginfo('Converting hours played values to {0} bins with a bin factor'
                ' of {1}.'.format(bins, bin_factor))

    # For each game in our list of games, we will read in the reviews
    # from the data file and then put entries in our MongoDB collection
    # with a key that identifies each review as either training or test
    for game_file in game_files:
        loginfo('Getting/inserting reviews for {}...'
                .format(splitext(basename(game_file))[0]))
        insert_train_test_reviews(reviewdb, realpath(join(data_dir, game_file)),
                                  max_size, percent_train, bins=bins,
                                  bin_factor=bin_factor, describe=make_reports,
                                  just_describe=just_describe,
                                  reports_dir=reports_dir if reports_dir
                                              else default_reports_dir)

    loginfo('Complete.')
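Neither `get_bin_ranges` nor the bin-assignment step is shown in these examples. Going only by the help text above (equal-width bins by default, with --bin_factor making the densely-populated low-hours bins narrower than the sparse high-hours bins), one plausible implementation, which may well differ from the project's `src.datasets.get_bin_ranges`, is:

# Hypothetical implementation of the binning scheme described by the
# --convert_to_bins/-bins and --bin_factor help text; the project's
# src.datasets.get_bin_ranges may differ.
def get_bin_ranges(_min, _max, nbins, factor=1.0):
    """Return a list of (start, end) ranges covering [_min, _max].

    With factor == 1.0 the bins are equal-width, matching the help text's
    example (hours 0-1000 and nbins=10 gives 0-100, 100-200, ...). With
    factor > 1.0 each successive bin is `factor` times wider, so the
    densely-populated low end of the scale gets narrower bins.
    """
    # Widths are proportional to 1, factor, factor**2, ...
    base = (_max - _min)/sum(factor**i for i in range(nbins))
    ranges = []
    start = _min
    for i in range(nbins):
        end = start + base*factor**i
        ranges.append((start, end))
        start = end
    return ranges


def get_bin(bin_ranges, value):
    """Map a value to its 1-based bin index (the "hours_bin" value)."""
    for i, (start, end) in enumerate(bin_ranges, start=1):
        if start <= value <= end:
            return i
    raise ValueError('{0} falls outside of all bin ranges'.format(value))

For example, get_bin_ranges(0, 1000, 10) yields ten ranges of width 100, and get_bin(..., 150) then returns 2, matching the "100-199 will become 2" description above.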