def make_ablation_data():
    # Remove old CV data
    for old_file in glob.glob(os.path.join(_my_dir, 'output',
                                           'ablation_cv_*.results')):
        os.remove(old_file)

    num_examples = 1000

    np.random.seed(1234567890)

    # Create lists we will write files from
    ids = []
    features = []
    classes = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{}".format(feat_num): np.random.randint(0, 4)
             for feat_num in range(5)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    # Write out one single-feature file per feature
    for i in range(5):
        train_path = os.path.join(_my_dir, 'train', 'f{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i
            x = {"f{}".format(feat_num):
                 features[example_num]["f{}".format(feat_num)]}
            sub_features.append(x)
        write_feature_file(train_path, ids, classes, sub_features)
def make_sparse_data():
    # Create training file
    train_path = os.path.join(_my_dir, 'train', 'test_sparse.jsonlines')
    ids = []
    classes = []
    features = []
    for i in range(1, 101):
        y = "dog" if i % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, i)
        # note that f1 and f5 are missing in all instances but f4 is not
        x = {"f2": i + 1, "f3": i + 2, "f4": i + 5}
        ids.append(ex_id)
        classes.append(y)
        features.append(x)
    write_feature_file(train_path, ids, classes, features)

    # Create test file
    test_path = os.path.join(_my_dir, 'test', 'test_sparse.jsonlines')
    ids = []
    classes = []
    features = []
    for i in range(1, 51):
        y = "dog" if i % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, i)
        # f1 and f5 are not missing in any instances here but f4 is
        x = {"f1": i, "f2": i + 2, "f3": i % 10, "f5": i * 2}
        ids.append(ex_id)
        classes.append(y)
        features.append(x)
    write_feature_file(test_path, ids, classes, features)
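# Why this fixture is "sparse": scikit-learn's DictVectorizer only creates
# columns for features seen during fitting, so the test-only features f1 and
# f5 get dropped at transform time while the train-only f4 column becomes
# zero. A minimal, self-contained illustration (the helper name is ours, not
# part of the original fixtures):
def _demo_sparse_feature_alignment():
    from sklearn.feature_extraction import DictVectorizer

    vec = DictVectorizer(sparse=False)
    vec.fit([{"f2": 2, "f3": 3, "f4": 6}])         # train-style features
    print(vec.feature_names_)                      # ['f2', 'f3', 'f4']
    # f1/f5 are ignored and the missing f4 defaults to zero:
    print(vec.transform([{"f1": 1, "f2": 3, "f3": 1, "f5": 2}]))  # [[3. 1. 0.]]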
def make_class_map_data():
    # Create training file
    train_path = os.path.join(_my_dir, 'train', 'test_class_map.jsonlines')
    ids = []
    classes = []
    features = []
    class_names = ['beagle', 'cat', 'dachsund', 'cat']
    for i in range(1, 101):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # note that f1 and f5 are missing in all instances but f4 is not
        x = {"f2": i + 1, "f3": i + 2, "f4": i + 5}
        ids.append(ex_id)
        classes.append(y)
        features.append(x)
    write_feature_file(train_path, ids, classes, features)

    # Create test file
    test_path = os.path.join(_my_dir, 'test', 'test_class_map.jsonlines')
    ids = []
    classes = []
    features = []
    for i in range(1, 51):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # f1 and f5 are not missing in any instances here but f4 is
        x = {"f1": i, "f2": i + 2, "f3": i % 10, "f5": i * 2}
        ids.append(ex_id)
        classes.append(y)
        features.append(x)
    write_feature_file(test_path, ids, classes, features)
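# The point of the duplicated "cat" and the two dog breeds in class_names
# above is to exercise label collapsing. A hedged sketch of the kind of SKLL
# class_map configuration value this fixture pairs with (exact syntax depends
# on the SKLL version; check its documentation):
#
#     class_map = {"dog": ["beagle", "dachsund"], "cat": ["cat"]}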
def make_regression_data():
    num_examples = 2000
    num_train_examples = int(num_examples / 2)

    np.random.seed(1234567890)
    f1 = np.random.rand(num_examples)
    f2 = np.random.rand(num_examples)
    f3 = np.random.rand(num_examples)
    err = np.random.randn(num_examples) / 2.0
    y = 1.0 * f1 + 1.0 * f2 - 2.0 * f3 + err
    y = y.tolist()

    # Write training file
    train_dir = os.path.join(_my_dir, 'train')
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    train_path = os.path.join(train_dir, 'test_regression1.jsonlines')
    features = [{"f1": f1[i], "f2": f2[i], "f3": f3[i]}
                for i in range(num_train_examples)]
    write_feature_file(train_path, None, y[:num_train_examples], features)

    # Write test file
    test_dir = os.path.join(_my_dir, 'test')
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
    test_path = os.path.join(test_dir, 'test_regression1.jsonlines')
    features = [{"f1": f1[i], "f2": f2[i], "f3": f3[i]}
                for i in range(num_train_examples, num_examples)]
    write_feature_file(test_path, None, y[num_train_examples:num_examples],
                       features)

    return y
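# Sanity-check sketch (the helper below is ours, not part of the original
# fixtures): the targets above are built as y = 1.0*f1 + 1.0*f2 - 2.0*f3
# plus Gaussian noise, so an ordinary least-squares fit on freshly generated
# data should recover coefficients close to [1.0, 1.0, -2.0].
def _demo_regression_coefficients():
    from sklearn.linear_model import LinearRegression

    np.random.seed(1234567890)
    f1 = np.random.rand(2000)
    f2 = np.random.rand(2000)
    f3 = np.random.rand(2000)
    y = 1.0 * f1 + 1.0 * f2 - 2.0 * f3 + np.random.randn(2000) / 2.0
    X = np.column_stack([f1, f2, f3])
    print(LinearRegression().fit(X, y).coef_)      # roughly [ 1.  1. -2.]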
def make_summary_data():
    num_train_examples = 500
    num_test_examples = 100

    np.random.seed(1234567890)

    # Write training file
    train_path = os.path.join(_my_dir, 'train', 'test_summary.jsonlines')
    classes = []
    ids = []
    features = []
    for i in range(num_train_examples):
        y = "dog" if i % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, i)
        x = {"f1": np.random.randint(1, 4),
             "f2": np.random.randint(1, 4),
             "f3": np.random.randint(1, 4)}
        classes.append(y)
        ids.append(ex_id)
        features.append(x)
    write_feature_file(train_path, ids, classes, features)

    # Write test file
    test_path = os.path.join(_my_dir, 'test', 'test_summary.jsonlines')
    classes = []
    ids = []
    features = []
    for i in range(num_test_examples):
        y = "dog" if i % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, i)
        x = {"f1": np.random.randint(1, 4),
             "f2": np.random.randint(1, 4),
             "f3": np.random.randint(1, 4)}
        classes.append(y)
        ids.append(ex_id)
        features.append(x)
    write_feature_file(test_path, ids, classes, features)
def make_scaling_data():
    num_train_examples = 1000
    num_test_examples = 100

    np.random.seed(1234567890)

    # create training data
    ids = []
    features = []
    classes = []
    for j in range(num_train_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"g{}".format(feat_num): np.random.randint(0, 4)
             for feat_num in range(5)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    for i in range(5):
        train_path = os.path.join(_my_dir, 'train', 'g{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_train_examples):
            feat_num = i
            x = {"g{}".format(feat_num):
                 features[example_num]["g{}".format(feat_num)]}
            sub_features.append(x)
        write_feature_file(train_path, ids, classes, sub_features)

    # create the test data; reset the lists so the test files contain only
    # test examples (otherwise the ids/classes lists would still hold the
    # 1000 training entries and misalign with the 100 test feature dicts)
    ids = []
    features = []
    classes = []
    for j in range(num_test_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"g{}".format(feat_num): np.random.randint(0, 4)
             for feat_num in range(5)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    for i in range(5):
        test_path = os.path.join(_my_dir, 'test', 'g{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_test_examples):
            feat_num = i
            x = {"g{}".format(feat_num):
                 features[example_num]["g{}".format(feat_num)]}
            sub_features.append(x)
        write_feature_file(test_path, ids, classes, sub_features)
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = os.path.join(_my_dir, 'train', 'test_conversion')
    if not os.path.exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    classes = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4)
             for feat_num in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = os.path.join(convert_dir,
                                  '{}_{}{}'.format(feature_name_prefix, i,
                                                   from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)]
                 for j in range(num_feats_per_file)}
            sub_features.append(x)
        write_feature_file(train_path, ids, classes, sub_features)

    # Write out the merged features in the `to_suffix` file format
    train_path = os.path.join(convert_dir,
                              '{}_all{}'.format(feature_name_prefix,
                                                to_suffix))
    write_feature_file(train_path, ids, classes, features)
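# Usage sketch (hypothetical call, not in the original module): this would
# write three unmerged .jsonlines feature files plus one merged .csv file
# under train/test_conversion:
#
#     make_conversion_data(3, '.jsonlines', '.csv')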
def make_merging_data(num_feat_files, suffix, numeric_ids):
    num_examples = 500
    num_feats_per_file = 17

    np.random.seed(1234567890)

    merge_dir = os.path.join(_my_dir, 'train', 'test_merging')
    if not os.path.exists(merge_dir):
        os.makedirs(merge_dir)

    # Create lists we will write files from
    ids = []
    features = []
    classes = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j) if not numeric_ids else j
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4)
             for feat_num in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        classes.append(y)
        features.append(x)

    # Unmerged
    subset_dict = {}
    for i in range(num_feat_files):
        feat_num = i * num_feats_per_file
        subset_dict['{}'.format(i)] = ["f{:03d}".format(feat_num + j)
                                       for j in range(num_feats_per_file)]
    train_path = os.path.join(merge_dir, suffix)
    write_feature_file(train_path, ids, classes, features,
                       subsets=subset_dict)

    # Merged
    train_path = os.path.join(merge_dir, 'all{}'.format(suffix))
    write_feature_file(train_path, ids, classes, features)
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another "
                    "format. Formats are determined automatically from file "
                    "extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .jsonlines, .tsv, '
                             '.csv, .arff, or .megam)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .jsonlines, .tsv, '
                             '.csv, .arff, or .megam)')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class '
                             'labels in ARFF, CSV, or TSV files. For ARFF '
                             'files, this must be the final column to count '
                             'as the label.',
                        default='y')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not '
                             'classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use '
                             'the same mapping from classes and features to '
                             'numbers when writing libsvm files, you can '
                             'specify an existing .libsvm file to reuse the '
                             'mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension == ".tsv":
        example_iter_type = _TSVDictIter
    elif input_extension == ".jsonlines" or input_extension == '.ndj':
        example_iter_type = _JSONDictIter
    elif input_extension == ".libsvm":
        example_iter_type = _LibSVMDictIter
    elif input_extension == ".megam":
        example_iter_type = _MegaMDictIter
    elif input_extension == ".csv":
        example_iter_type = _CSVDictIter
    elif input_extension == ".arff":
        example_iter_type = _ARFFDictIter
    else:
        logger.error(('Input file must be in either .arff, .csv, '
                      '.jsonlines, .libsvm, .megam, .ndj, or .tsv format. '
                      'You specified: {}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            # The map strings hold whitespace-separated pairs, so split them
            # before parsing (iterating over the bare string would yield
            # individual characters, not pairs)
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    # Iterate through input file and collect the information we need
    ids = []
    classes = []
    feature_dicts = []
    example_iter = example_iter_type(args.infile, quiet=args.quiet,
                                     label_col=args.label_col)
    for example_id, class_name, feature_dict in example_iter:
        feature_dicts.append(feature_dict)
        classes.append(class_name)
        ids.append(example_id)

    # write out the file in the requested output format
    write_feature_file(args.outfile, ids, classes, feature_dicts,
                       arff_regression=args.arff_regression,
                       arff_relation=args.arff_relation,
                       feat_vectorizer=feat_vectorizer,
                       label_map=label_map)
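# Invocation sketch (file names are hypothetical): parse_args(None) falls
# back to sys.argv[1:], so the converter works both from the command line
# and programmatically, e.g. in tests:
#
#     main(['train.csv', 'train.jsonlines'])
#     main(['train.csv', 'train.arff', '--label_col', 'class',
#           '--arff_relation', 'my_relation'])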