# Module-level imports and the test-directory constant these helpers rely on.
# (Assumed: the excerpts reference these names but do not define them here.)
import logging
import os
from collections import OrderedDict
from itertools import chain
from os.path import exists, join

import numpy as np
from sklearn.feature_extraction import DictVectorizer

from skll.data import FeatureSet, Reader, Writer

_my_dir = os.path.dirname(os.path.abspath(__file__))  # assumed test directory


def make_merging_data(num_feat_files, suffix, numeric_ids):
    num_examples = 500
    num_feats_per_file = 17

    np.random.seed(1234567890)

    merge_dir = join(_my_dir, 'train', 'test_merging')
    if not exists(merge_dir):
        os.makedirs(merge_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j) if not numeric_ids else j
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4)
             for feat_num in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Unmerged
    subset_dict = {}
    for i in range(num_feat_files):
        feat_num = i * num_feats_per_file
        subset_dict['{}'.format(i)] = ["f{:03d}".format(feat_num + j)
                                       for j in range(num_feats_per_file)]
    train_path = join(merge_dir, suffix)
    train_fs = FeatureSet('train', ids, labels=labels, features=features)
    Writer.for_path(train_path, train_fs, subsets=subset_dict).write()

    # Merged
    train_path = join(merge_dir, 'all{}'.format(suffix))
    Writer.for_path(train_path, train_fs).write()
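# Hedged usage sketch (hypothetical helper, not part of the original
# utilities): write 3 unmerged .jsonlines feature files plus the merged
# 'all.jsonlines' file under <_my_dir>/train/test_merging, then read the
# merged file back with SKLL's generic Reader to check that it round-trips.
def _example_check_merging():
    make_merging_data(3, '.jsonlines', numeric_ids=False)
    merged_path = join(_my_dir, 'train', 'test_merging', 'all.jsonlines')
    merged_fs = Reader.for_path(merged_path, quiet=True).read()
    assert len(merged_fs) == 500  # one FeatureSet row per generated example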
def main():
    """
    Create directories and split CSV files into subsets.
    """
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Create dictionary of subsets to use for creating split feature files
    subset_dict = {'vitals': ['Sex', 'Age'],
                   'socioeconomic': ['Pclass', 'Fare'],
                   'family': ['SibSp', 'Parch'],
                   'misc': ['Embarked']}
    features_to_keep = list(chain(*subset_dict.values()))

    # Create directories to store files
    if not os.path.exists('titanic/train'):
        logger.info('Creating titanic/train directory')
        os.makedirs('titanic/train')
    if not os.path.exists('titanic/dev'):
        logger.info('Creating titanic/dev directory')
        os.makedirs('titanic/dev')
    if not os.path.exists('titanic/train+dev'):
        logger.info('Creating titanic/train+dev directory')
        os.makedirs('titanic/train+dev')
    if not os.path.exists('titanic/test'):
        logger.info('Creating titanic/test directory')
        os.makedirs('titanic/test')

    usecols_train = features_to_keep + ['PassengerId', 'Survived']
    usecols_test = features_to_keep + ['PassengerId']

    # Read and write training FeatureSet
    train_fs = Reader.for_path('titanic/train.csv',
                               label_col='Survived',
                               id_col='PassengerId',
                               drop_blanks=True,
                               pandas_kwargs={'usecols': usecols_train},
                               quiet=False,
                               sparse=False).read()
    train_fs.filter(features=features_to_keep)
    num_train_dev = len(train_fs)
    num_train = int((num_train_dev / 5) * 4)
    writer = Writer.for_path('titanic/train/.csv',
                             train_fs[:num_train],
                             id_col='PassengerId',
                             label_col='Survived',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()

    # Write train+dev set for training model to use to generate predictions
    # on test
    writer = Writer.for_path('titanic/train+dev/.csv',
                             train_fs,
                             label_col='Survived',
                             id_col='PassengerId',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()

    # Write dev FeatureSet
    writer = Writer.for_path('titanic/dev/.csv',
                             train_fs[num_train:],
                             label_col='Survived',
                             id_col='PassengerId',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()

    # Read and write test FeatureSet
    test_fs = Reader.for_path('titanic/test.csv',
                              label_col='Survived',
                              drop_blanks=True,
                              pandas_kwargs={'usecols': usecols_test},
                              quiet=False,
                              sparse=False).read()
    test_fs.filter(features=features_to_keep)
    num_test = len(test_fs)
    test_fs.ids = list(range(num_train_dev + 1, num_test + num_train_dev + 1))
    writer = Writer.for_path('titanic/test/.csv',
                             test_fs,
                             id_col='PassengerId',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()
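# Hedged sketch (assumption, not shown in the excerpt): run as a standalone
# script with the Kaggle 'titanic/train.csv' and 'titanic/test.csv' files in
# the working directory, the usual entry-point guard would kick off the split.
if __name__ == '__main__':
    main()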
def make_conversion_data(num_feat_files, from_suffix, to_suffix,
                         with_labels=True):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)

        # if we are not using labels, we do not want zero-valued features
        # because it may be the case that some subset of features ends up
        # being all 0; if that subset is written out to a file below, then
        # for some formats (e.g., megam) nothing will get written out, which
        # can cause issues when reading the file back
        lowest_feature_value = 0 if with_labels else 1
        x = {"f{:03d}".format(feat_num):
             np.random.randint(lowest_feature_value,
                               4 + lowest_feature_value)
             for feat_num in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        if with_labels:
            labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    if with_labels:
        label_map = {label: num for num, label in
                     enumerate(sorted({label for label in labels
                                       if not isinstance(label,
                                                         (int, float))}))}
        # Add fake item to vectorizer for None
        label_map[None] = '00000'
    else:
        label_map = None

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(convert_dir,
                          '{}_{}{}{}'.format(feature_name_prefix, i,
                                             with_labels_part, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)]
                 for j in range(num_feats_per_file)}
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs,
                            label_map=label_map).write()
        elif from_suffix in ['.arff', '.csv', '.tsv']:
            label_col = 'y' if with_labels else None
            Writer.for_path(train_path, train_fs,
                            label_col=label_col).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir,
                      '{}{}_all{}'.format(feature_name_prefix,
                                          with_labels_part, to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)

    # we need to do this to get around the FeatureSet using NaNs
    # instead of None when there are no labels, which causes problems
    # later when comparing featuresets
    if not with_labels:
        train_fs.labels = [None] * len(train_fs.labels)

    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs, label_map=label_map).write()
    elif to_suffix in ['.arff', '.csv', '.tsv']:
        label_col = 'y' if with_labels else None
        Writer.for_path(train_path, train_fs, label_col=label_col).write()
    else:
        Writer.for_path(train_path, train_fs).write()
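# Hedged usage sketch (hypothetical helper, not part of the original test
# utilities): build CSV-to-libsvm conversion fixtures with the defaults
# (labeled data), then read back the merged libsvm file with SKLL's Reader.
# Passing with_labels=False to the variant above would instead produce
# '_unlabeled' files whose feature values are shifted away from zero.
def _example_make_conversion_fixtures():
    make_conversion_data(3, '.csv', '.libsvm')
    merged_path = join(_my_dir, 'train', 'test_conversion',
                       'csv_to_libsvm_all.libsvm')
    merged_fs = Reader.for_path(merged_path, quiet=True).read()
    assert len(merged_fs) == 500  # all generated examples end up merged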
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4)
             for feat_num in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    label_map = {label: num for num, label in
                 enumerate(sorted({label for label in labels
                                   if not isinstance(label, (int, float))}))}
    # Add fake item to vectorizer for None
    label_map[None] = '00000'

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(convert_dir, '{}_{}{}'.format(feature_name_prefix,
                                                        i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)]
                 for j in range(num_feats_per_file)}
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs,
                            label_map=label_map).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
                                                     to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)
    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs, label_map=label_map).write()
    else:
        Writer.for_path(train_path, train_fs).write()
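# Hedged verification sketch (hypothetical): after the fixtures above have
# been written, read one unmerged CSV fixture and the merged libsvm file and
# confirm that both contain the same number of examples.
def _example_check_conversion_fixture():
    convert_dir = join(_my_dir, 'train', 'test_conversion')
    sub_fs = Reader.for_path(join(convert_dir, 'csv_to_libsvm_0.csv'),
                             quiet=True).read()
    merged_fs = Reader.for_path(join(convert_dir, 'csv_to_libsvm_all.libsvm'),
                                quiet=True).read()
    assert len(sub_fs) == len(merged_fs) == 500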