Example #1
def make_ablation_data():
    # Remove old CV data
    for old_file in glob.glob(os.path.join(_my_dir, 'output',
                                           'ablation_cv_*.results')):
        os.remove(old_file)

    num_examples = 1000

    np.random.seed(1234567890)

    # Create lists we will write files from
    ids = []
    features = []
    classes = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{}".format(feat_num): np.random.randint(0, 4) for feat_num in
             range(5)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    for i in range(5):
        train_path = os.path.join(_my_dir, 'train', 'f{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i
            x = {"f{}".format(feat_num): features[example_num]["f{}".format(feat_num)]}
            sub_features.append(x)
        write_feature_file(train_path, ids, classes, sub_features)
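
All of these fixtures rely on a module-level `_my_dir` base path and a `write_feature_file` helper that are defined elsewhere in the test module. For orientation only, here is a minimal hypothetical stand-in for that helper covering just the .jsonlines case exercised above; the real helper also supports the other file formats and the keyword arguments (subsets, arff_regression, etc.) that later examples pass.

import json


def write_feature_file(path, ids, classes, features, **kwargs):
    # Hypothetical minimal stand-in, NOT the real SKLL test helper:
    # write one JSON object per line with "id", "y", and "x" keys,
    # the schema that SKLL's .jsonlines reader consumes.
    with open(path, 'w') as out_file:
        for num, feature_dict in enumerate(features):
            ex_id = ids[num] if ids is not None else "EXAMPLE_{}".format(num)
            example = {"id": ex_id, "y": classes[num], "x": feature_dict}
            out_file.write(json.dumps(example) + '\n')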
Example #2
def make_sparse_data():
    # Create training file
    train_path = os.path.join(_my_dir, 'train', 'test_sparse.jsonlines')
    ids = []
    classes = []
    features = []
    for i in range(1, 101):
        y = "dog" if i % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, i)
        # note that f1 and f5 are missing in all instances but f4 is not
        x = {"f2": i+1, "f3": i+2, "f4": i+5}
        ids.append(ex_id)
        classes.append(y)
        features.append(x)
    write_feature_file(train_path, ids, classes, features)

    # Create test file
    test_path = os.path.join(_my_dir, 'test', 'test_sparse.jsonlines')
    ids = []
    classes = []
    features = []
    for i in range(1, 51):
        y = "dog" if i % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, i)
        # f1 and f5 are not missing in any instances here but f4 is
        x = {"f1": i, "f2": i+2, "f3": i % 10, "f5": i * 2}
        ids.append(ex_id)
        classes.append(y)
        features.append(x)
    write_feature_file(test_path, ids, classes, features)
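
The point of this fixture pair is that train and test deliberately share only f2 and f3, so the feature vectorizer must tolerate features that occur on one side only. A quick sketch of the underlying behavior, assuming scikit-learn's DictVectorizer (which SKLL uses for vectorization):

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=True)
vec.fit([{"f2": 3, "f3": 4, "f4": 7}])  # train-side keys only
# Keys unseen at fit time (f1, f5) are silently ignored at transform
# time, and fitted keys missing from an instance become zero columns.
X_test = vec.transform([{"f1": 1, "f2": 3, "f3": 1, "f5": 2}])
print(X_test.toarray())  # [[3. 1. 0.]] for columns f2, f3, f4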
Example #3
def make_class_map_data():
    # Create training file
    train_path = os.path.join(_my_dir, 'train', 'test_class_map.jsonlines')
    ids = []
    classes = []
    features = []
    class_names = ['beagle', 'cat', 'dachsund', 'cat']
    for i in range(1, 101):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # note that f1 and f5 are missing in all instances but f4 is not
        x = {"f2": i+1, "f3": i+2, "f4": i+5}
        ids.append(ex_id)
        classes.append(y)
        features.append(x)
    write_feature_file(train_path, ids, classes, features)

    # Create test file
    test_path = os.path.join(_my_dir, 'test', 'test_class_map.jsonlines')
    ids = []
    classes = []
    features = []
    for i in range(1, 51):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # f1 and f5 are not missing in any instances here but f4 is
        x = {"f1": i, "f2": i+2, "f3": i % 10, "f5": i * 2}
        ids.append(ex_id)
        classes.append(y)
        features.append(x)
    write_feature_file(test_path, ids, classes, features)
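
These fixtures are only meaningful alongside a class_map configuration that collapses the fine-grained labels back to dog/cat. A hypothetical mapping consistent with the labels above (note the fixture spells 'dachsund' that way, so any map must match it exactly):

# Hypothetical class_map pairing for these fixtures (not shown in the
# source); keys are collapsed labels, values the original fixture labels.
class_map = {'dog': ['beagle', 'dachsund'], 'cat': ['cat']}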
Example #4
def make_regression_data():
    num_examples = 2000
    num_train_examples = int(num_examples / 2)

    np.random.seed(1234567890)
    f1 = np.random.rand(num_examples)
    f2 = np.random.rand(num_examples)
    f3 = np.random.rand(num_examples)
    err = np.random.randn(num_examples) / 2.0
    y = 1.0 * f1 + 1.0 * f2 - 2.0 * f3 + err
    y = y.tolist()

    # Write training file
    train_dir = os.path.join(_my_dir, 'train')
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    train_path = os.path.join(train_dir, 'test_regression1.jsonlines')
    features = [{"f1": f1[i], "f2": f2[i], "f3": f3[i]} for i in
                range(num_train_examples)]
    write_feature_file(train_path, None, y[:num_train_examples], features)

    # Write test file
    test_dir = os.path.join(_my_dir, 'test')
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
    test_path = os.path.join(test_dir, 'test_regression1.jsonlines')
    features = [{"f1": f1[i], "f2": f2[i], "f3": f3[i]} for i in
                range(num_train_examples, num_examples)]
    write_feature_file(test_path, None, y[num_train_examples: num_examples],
                       features)

    return y
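
Since err = np.random.randn(n) / 2.0 has standard deviation 0.5, the targets follow y = f1 + f2 - 2*f3 plus modest noise, so an ordinary least-squares fit should recover coefficients near (1, 1, -2). A quick sanity check, assuming scikit-learn is available:

import numpy as np
from sklearn.linear_model import LinearRegression

np.random.seed(1234567890)
f1 = np.random.rand(2000)
f2 = np.random.rand(2000)
f3 = np.random.rand(2000)
y = 1.0 * f1 + 1.0 * f2 - 2.0 * f3 + np.random.randn(2000) / 2.0

model = LinearRegression().fit(np.column_stack([f1, f2, f3]), y)
print(model.coef_)  # expect values close to [ 1.  1. -2.]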
Example #5
def make_summary_data():
    num_train_examples = 500
    num_test_examples = 100

    np.random.seed(1234567890)

    # Write training file
    train_path = os.path.join(_my_dir, 'train', 'test_summary.jsonlines')
    classes = []
    ids = []
    features = []
    for i in range(num_train_examples):
        y = "dog" if i % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, i)
        x = {"f1": np.random.randint(1, 4), "f2": np.random.randint(1, 4),
             "f3": np.random.randint(1, 4)}
        classes.append(y)
        ids.append(ex_id)
        features.append(x)
    write_feature_file(train_path, ids, classes, features)

    # Write test file
    test_path = os.path.join(_my_dir, 'test', 'test_summary.jsonlines')
    classes = []
    ids = []
    features = []
    for i in range(num_test_examples):
        y = "dog" if i % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, i)
        x = {"f1": np.random.randint(1, 4), "f2": np.random.randint(1, 4),
             "f3": np.random.randint(1, 4)}
        classes.append(y)
        ids.append(ex_id)
        features.append(x)
    write_feature_file(test_path, ids, classes, features)
Example #6
def make_scaling_data():
    num_train_examples = 1000
    num_test_examples = 100

    np.random.seed(1234567890)

    # create training data
    ids = []
    features = []
    classes = []
    for j in range(num_train_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {
            "g{}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(5)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    for i in range(5):
        train_path = os.path.join(_my_dir, 'train', 'g{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_train_examples):
            feat_num = i
            x = {
                "g{}".format(feat_num):
                features[example_num]["g{}".format(feat_num)]
            }
            sub_features.append(x)
        write_feature_file(train_path, ids, classes, sub_features)

    # create the test data
    ids = []
    features = []
    classes = []
    for j in range(num_test_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {
            "g{}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(5)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    for i in range(5):
        test_path = os.path.join(_my_dir, 'test', 'g{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_test_examples):
            feat_num = i
            x = {
                "g{}".format(feat_num):
                features[example_num]["g{}".format(feat_num)]
            }
            sub_features.append(x)
        write_feature_file(test_path, ids, classes, sub_features)
Example #7
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = os.path.join(_my_dir, 'train', 'test_conversion')
    if not os.path.exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    classes = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {
            "f{:03d}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = os.path.join(
            convert_dir, '{}_{}{}'.format(feature_name_prefix, i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {
                "f{:03d}".format(feat_num + j):
                features[example_num]["f{:03d}".format(feat_num + j)]
                for j in range(num_feats_per_file)
            }
            sub_features.append(x)
        write_feature_file(train_path, ids, classes, sub_features)

    # Write out the merged features in the `to_suffix` file format
    train_path = os.path.join(
        convert_dir, '{}_all{}'.format(feature_name_prefix, to_suffix))
    write_feature_file(train_path, ids, classes, features)
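
As a concrete illustration with hypothetical arguments, the call below would generate 21 features (f000-f020), write jsonlines_to_megam_0.jsonlines through jsonlines_to_megam_2.jsonlines with seven features each, and write the merged jsonlines_to_megam_all.megam, all under train/test_conversion:

# Hypothetical invocation: three unmerged .jsonlines files plus one
# merged .megam file.
make_conversion_data(3, '.jsonlines', '.megam')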
Example #8
def make_scaling_data():
    num_train_examples = 1000
    num_test_examples = 100

    np.random.seed(1234567890)

    # create training data
    ids = []
    features = []
    classes = []
    for j in range(num_train_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"g{}".format(feat_num): np.random.randint(0, 4) for feat_num in
             range(5)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    for i in range(5):
        train_path = os.path.join(_my_dir, 'train', 'g{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_train_examples):
            feat_num = i
            x = {"g{}".format(feat_num): features[example_num]["g{}".format(feat_num)]}
            sub_features.append(x)
        write_feature_file(train_path, ids, classes, sub_features)

    # create the test data
    ids = []
    features = []
    classes = []
    for j in range(num_test_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"g{}".format(feat_num): np.random.randint(0, 4) for feat_num in
             range(5)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    for i in range(5):
        test_path = os.path.join(_my_dir, 'test', 'g{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_test_examples):
            feat_num = i
            x = {"g{}".format(feat_num): features[example_num]["g{}".format(feat_num)]}
            sub_features.append(x)
        write_feature_file(test_path, ids, classes, sub_features)
Example #9
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = os.path.join(_my_dir, 'train', 'test_conversion')
    if not os.path.exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    classes = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for feat_num in
             range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'), to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = os.path.join(convert_dir,
                                  '{}_{}{}'.format(feature_name_prefix,
                                                   i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j): features[example_num]["f{:03d}".format(feat_num + j)] for j in range(num_feats_per_file)}
            sub_features.append(x)
        write_feature_file(train_path, ids, classes, sub_features)

    # Write out the merged features in the `to_suffix` file format
    train_path = os.path.join(convert_dir, '{}_all{}'.format(feature_name_prefix,
                                                             to_suffix))
    write_feature_file(train_path, ids, classes, features)
Example #10
def make_merging_data(num_feat_files, suffix, numeric_ids):
    num_examples = 500
    num_feats_per_file = 17

    np.random.seed(1234567890)

    merge_dir = os.path.join(_my_dir, 'train', 'test_merging')
    if not os.path.exists(merge_dir):
        os.makedirs(merge_dir)

    # Create lists we will write files from
    ids = []
    features = []
    classes = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j) if not numeric_ids else j
        x = {
            "f{:03d}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    # Unmerged
    subset_dict = {}
    for i in range(num_feat_files):
        feat_num = i * num_feats_per_file
        subset_dict['{}'.format(i)] = [
            "f{:03d}".format(feat_num + j) for j in range(num_feats_per_file)
        ]
    train_path = os.path.join(merge_dir, suffix)
    write_feature_file(train_path, ids, classes, features, subsets=subset_dict)

    # Merged
    train_path = os.path.join(merge_dir, 'all{}'.format(suffix))
    write_feature_file(train_path, ids, classes, features)
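
For instance, with a hypothetical num_feat_files=2, the subset mapping handed to write_feature_file would partition the 34 features like this:

# Equivalent to what the loop above builds for num_feat_files=2:
subset_dict = {'0': ['f{:03d}'.format(j) for j in range(0, 17)],   # f000-f016
               '1': ['f{:03d}'.format(j) for j in range(17, 34)]}  # f017-f033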
Example #11
def make_merging_data(num_feat_files, suffix, numeric_ids):
    num_examples = 500
    num_feats_per_file = 17

    np.random.seed(1234567890)

    merge_dir = os.path.join(_my_dir, 'train', 'test_merging')
    if not os.path.exists(merge_dir):
        os.makedirs(merge_dir)

    # Create lists we will write files from
    ids = []
    features = []
    classes = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j) if not numeric_ids else j
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for feat_num in
             range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    # Unmerged
    subset_dict = {}
    for i in range(num_feat_files):
        feat_num = i * num_feats_per_file
        subset_dict['{}'.format(i)] = ["f{:03d}".format(feat_num + j) for j in
                                       range(num_feats_per_file)]
    train_path = os.path.join(merge_dir, suffix)
    write_feature_file(train_path, ids, classes, features, subsets=subset_dict)

    # Merged
    train_path = os.path.join(merge_dir, 'all{}'.format(suffix))
    write_feature_file(train_path, ids, classes, features)
Example #12
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(description="Takes an input feature file \
                                                  and converts it to another \
                                                  format. Formats are \
                                                  determined automatically from\
                                                  file extensions.",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from classes and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension == ".tsv":
        example_iter_type = _TSVDictIter
    elif input_extension == ".jsonlines" or input_extension == '.ndj':
        example_iter_type = _JSONDictIter
    elif input_extension == ".libsvm":
        example_iter_type = _LibSVMDictIter
    elif input_extension == ".megam":
        example_iter_type = _MegaMDictIter
    elif input_extension == ".csv":
        example_iter_type = _CSVDictIter
    elif input_extension == ".arff":
        example_iter_type = _ARFFDictIter
    else:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    # Iterate through input file and collect the information we need
    ids = []
    classes = []
    feature_dicts = []
    example_iter = example_iter_type(args.infile, quiet=args.quiet,
                                     label_col=args.label_col)
    for example_id, class_name, feature_dict in example_iter:
        feature_dicts.append(feature_dict)
        classes.append(class_name)
        ids.append(example_id)

    # write out the file in the requested output format
    write_feature_file(args.outfile, ids, classes, feature_dicts,
                       arff_regression=args.arff_regression,
                       arff_relation=args.arff_relation,
                       feat_vectorizer=feat_vectorizer,
                       label_map=label_map)
Example #13
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(description="Takes an input feature file \
                                                  and converts it to another \
                                                  format. Formats are \
                                                  determined automatically from\
                                                  file extensions.",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()

    if input_extension == ".tsv":
        example_iter_type = _TSVDictIter
    elif input_extension == ".jsonlines" or input_extension == '.ndj':
        example_iter_type = _JSONDictIter
    elif input_extension == ".megam":
        example_iter_type = _MegaMDictIter
    elif input_extension == ".csv":
        example_iter_type = _CSVDictIter
    elif input_extension == ".arff":
        example_iter_type = _ARFFDictIter
    else:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, ' +
                      '.megam, .ndj, or .tsv format. You specified: ' +
                      '{}').format(input_extension))
        sys.exit(1)

    # Iterate through input file and collect the information we need
    ids = []
    classes = []
    feature_dicts = []
    example_iter = example_iter_type(args.infile, quiet=args.quiet,
                                     label_col=args.label_col)
    for example_id, class_name, feature_dict in example_iter:
        feature_dicts.append(feature_dict)
        classes.append(class_name)
        ids.append(example_id)

    # write out the file in the requested output format
    write_feature_file(args.outfile, ids, classes, feature_dicts,
                       arff_regression=args.arff_regression,
                       arff_relation=args.arff_relation)
Example #14
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file \
                                                  and converts it to another \
                                                  format. Formats are \
                                                  determined automatically from\
                                                  file extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('-l',
                        '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q',
                        '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument(
        '--arff_regression',
        help='Create ARFF files for regression, not classification.',
        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from classes and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension == ".tsv":
        example_iter_type = _TSVDictIter
    elif input_extension == ".jsonlines" or input_extension == '.ndj':
        example_iter_type = _JSONDictIter
    elif input_extension == ".libsvm":
        example_iter_type = _LibSVMDictIter
    elif input_extension == ".megam":
        example_iter_type = _MegaMDictIter
    elif input_extension == ".csv":
        example_iter_type = _CSVDictIter
    elif input_extension == ".arff":
        example_iter_type = _ARFFDictIter
    else:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line,
                                 ['utf-8', 'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(
                _pair_to_dict_tuple(pair) for pair in feat_map_str.strip().split())
            label_map.update(
                _pair_to_dict_tuple(pair) for pair in label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    # Iterate through input file and collect the information we need
    ids = []
    classes = []
    feature_dicts = []
    example_iter = example_iter_type(args.infile,
                                     quiet=args.quiet,
                                     label_col=args.label_col)
    for example_id, class_name, feature_dict in example_iter:
        feature_dicts.append(feature_dict)
        classes.append(class_name)
        ids.append(example_id)

    # write out the file in the requested output format
    write_feature_file(args.outfile,
                       ids,
                       classes,
                       feature_dicts,
                       arff_regression=args.arff_regression,
                       arff_relation=args.arff_relation,
                       feat_vectorizer=feat_vectorizer,
                       label_map=label_map)
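
Because main() accepts the same argument list that sys.argv[1:] would supply, a conversion can be driven programmatically as well as from the shell. A hypothetical run (features.csv and features.jsonlines are placeholder paths):

# Convert a CSV feature file to .jsonlines, suppressing load messages;
# labels are read from the default column 'y'.
main(['--quiet', 'features.csv', 'features.jsonlines'])

# Roughly equivalent from the shell, assuming a skll_convert entry point:
#   skll_convert --quiet features.csv features.jsonlines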