def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Helper function for the two unit tests for FeatureSet.from_data_frame().
    Since labels are optional, run two tests, one with, one without.

    Parameters
    ----------
    with_labels : bool
        Whether to attach a label column to the DataFrame/FeatureSet.
    use_feature_hasher : bool
        Whether to use a ``FeatureHasher`` vectorizer instead of the default.

    Returns
    -------
    tuple
        ``(expected, current)`` — the directly-constructed FeatureSet and the
        one created via ``FeatureSet.from_data_frame()``.
    """
    import pandas

    # First, setup the test data.
    # get a 100 instances with 4 features each
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # Not using 0 - 100 here because that would be pandas' default index names anyway.
    # So let's make sure pandas is using the ids we supply.
    ids = list(range(100, 200))

    featureset_name = 'test'

    # if use_feature_hashing, run these tests with a vectorizer
    feature_bins = 4
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hasher else None)

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # Now, create a FeatureSet object.
    if with_labels:
        expected = FeatureSet(featureset_name, ids, features=features,
                              labels=y, vectorizer=vectorizer)
    else:
        expected = FeatureSet(featureset_name, ids, features=features,
                              vectorizer=vectorizer)

    # Also create a DataFrame and then create a FeatureSet from it.
    df = pandas.DataFrame(features, index=ids)
    if with_labels:
        df['y'] = y
        current = FeatureSet.from_data_frame(df, featureset_name,
                                             labels_column='y',
                                             vectorizer=vectorizer)
    else:
        current = FeatureSet.from_data_frame(df, featureset_name,
                                             vectorizer=vectorizer)

    return (expected, current)
def make_scaling_data(use_feature_hashing=False):
    """
    Build train/test FeatureSets for the feature-scaling tests.

    Each of the five features is multiplied by a different power of ten so
    that the columns have wildly different scales.  The first 800 of the
    1000 generated examples go into the training set, the rest into the
    test set.

    Parameters
    ----------
    use_feature_hashing : bool, optional
        If True, attach a 4-bin ``FeatureHasher`` vectorizer to both sets.

    Returns
    -------
    tuple
        ``(train_fs, test_fs)``.
    """
    X, y = make_classification(n_samples=1000,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=1234567890)

    # scale each feature column by a different power of ten
    X = X * np.array([1, 10, 100, 1000, 10000])

    # SKLL FeatureSets need explicit example IDs
    ids = ['EXAMPLE_{}'.format(idx) for idx in range(1, 1001)]

    # one {feature name: value} dictionary per row
    names = ['f{}'.format(idx) for idx in range(1, 6)]
    features = [dict(zip(names, row)) for row in X]

    # first 800 examples train, remaining 200 test
    split = 800
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    train_fs = FeatureSet('train_scaling',
                          ids[:split],
                          features=features[:split],
                          labels=y[:split],
                          vectorizer=vectorizer)
    test_fs = FeatureSet('test_scaling',
                         ids[split:],
                         features=features[split:],
                         labels=y[split:],
                         vectorizer=vectorizer)
    return (train_fs, test_fs)
def make_learning_curve_data():
    """
    Create and write out the two jsonlines featuresets used by the
    learning-curve tests: one with all of the digits features and one
    that drops the last feature.
    """
    # Load in the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # create featureset with all features
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))
    fs1 = FeatureSet('train1', features=features, labels=y,
                     ids=list(range(X.shape[0])))

    # Write this feature set to file
    train_path = join(_my_dir, 'train', 'test_learning_curve1.jsonlines')
    writer = NDJWriter(train_path, fs1)
    writer.write()

    # create featureset with all except the last feature
    # (zip() stops at the shorter sequence, so the last column is dropped)
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names[:-1], row)))
    fs2 = FeatureSet('train2', features=features, labels=y,
                     ids=list(range(X.shape[0])))

    # Write this feature set to file
    train_path = join(_my_dir, 'train', 'test_learning_curve2.jsonlines')
    writer = NDJWriter(train_path, fs2)
    writer.write()
def create_jsonlines_feature_files(path):
    """
    Create the jsonlines feature files used by the ablation tests.

    Six files are written under ``path``: ``f0``-``f4`` each hold a single
    feature per example, and ``f5`` repeats the last subset with two extra
    empty-feature instances.  Nothing is done if all six already exist.
    """
    # we only need to create the feature files if they
    # don't already exist under the given path
    feature_files_to_create = [
        join(path, 'f{}.jsonlines'.format(i)) for i in range(6)
    ]

    if all([exists(ff) for ff in feature_files_to_create]):
        return
    else:
        num_examples = 1000
        np.random.seed(1234567890)

        # Create lists we will write files from
        ids = []
        features = []
        labels = []
        for j in range(num_examples):
            # alternate between the two classes
            y = "dog" if j % 2 == 0 else "cat"
            ex_id = "{}{}".format(y, j)
            # five random integer-valued features per example
            x = {
                "f{}".format(feat_num): np.random.randint(0, 4)
                for feat_num in range(5)
            }
            x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
            ids.append(ex_id)
            labels.append(y)
            features.append(x)

        # write one file per feature, each containing only that feature
        for i in range(5):
            file_path = join(path, 'f{}.jsonlines'.format(i))
            sub_features = []
            for example_num in range(num_examples):
                feat_num = i
                x = {
                    "f{}".format(feat_num):
                    features[example_num]["f{}".format(feat_num)]
                }
                sub_features.append(x)
            fs = FeatureSet('ablation_cv', ids,
                            features=sub_features, labels=labels)
            writer = NDJWriter(file_path, fs)
            writer.write()

        # now write out the last file which is basically
        # identical to the last featureset we wrote
        # except that it has two extra instances
        fs = FeatureSet(
            'extra',
            ids + ['cat{}'.format(num_examples),
                   'dog{}'.format(num_examples + 1)],
            features=sub_features + [{}, {}],
            labels=labels + ['cat', 'dog'])
        file_path = join(path, 'f5.jsonlines')
        writer = NDJWriter(file_path, fs)
        writer.write()
def make_sparse_data(use_feature_hashing=False): """ Function to create sparse data with two features always zero in the training set and a different one always zero in the test set """ # Create training data X, y = make_classification(n_samples=500, n_features=3, n_informative=3, n_redundant=0, n_classes=2, random_state=1234567890) # we need features to be non-negative since we will be # using naive bayes laster X = np.abs(X) # make sure that none of the features are zero X[np.where(X == 0)] += 1 # since we want to use SKLL's FeatureSet class, we need to # create a list of IDs ids = ['EXAMPLE_{}'.format(n) for n in range(1, 501)] # create a list of dictionaries as the features # with f1 and f5 always 0 feature_names = ['f{}'.format(n) for n in range(1, 6)] features = [] for row in X: row = [0] + row.tolist() + [0] features.append(dict(zip(feature_names, row))) # use a FeatureHasher if we are asked to do feature hashing vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None train_fs = FeatureSet('train_sparse', ids, features=features, labels=y, vectorizer=vectorizer) # now create the test set with f4 always 0 but nothing else X, y = make_classification(n_samples=100, n_features=4, n_informative=4, n_redundant=0, n_classes=2, random_state=1234567890) X = np.abs(X) X[np.where(X == 0)] += 1 ids = ['EXAMPLE_{}'.format(n) for n in range(1, 101)] # create a list of dictionaries as the features # with f4 always 0 feature_names = ['f{}'.format(n) for n in range(1, 6)] features = [] for row in X: row = row.tolist() row = row[:3] + [0] + row[3:] features.append(dict(zip(feature_names, row))) test_fs = FeatureSet('test_sparse', ids, features=features, labels=y, vectorizer=vectorizer) return train_fs, test_fs
def make_regression_data(num_examples=100,
                         train_test_ratio=0.5,
                         num_features=2,
                         sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):
    """
    Create train/test regression FeatureSets via sklearn's
    ``make_regression()``.

    Returns
    -------
    tuple
        ``(train_fs, test_fs, weightdict)`` where ``weightdict`` maps
        feature names to the true regression coefficients.
    """
    # use sklearn's make_regression to generate the data for us
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # create a list of dictionaries as the features
    feature_names = [
        'f{:02d}'.format(n)
        for n in range(start_feature_num,
                       start_feature_num + num_features)
    ]
    features = [dict(zip(feature_names, row)) for row in X]

    # convert the weights array into a dictionary for convenience
    weightdict = dict(zip(feature_names, weights))

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hashing else None)
    train_fs = FeatureSet('regression_train',
                          train_ids,
                          labels=train_y,
                          features=train_features,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test',
                         test_ids,
                         labels=test_y,
                         features=test_features,
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
def test_featureset_creation_from_dataframe_with_string_labels():
    """
    Test that a FeatureSet with string labels can be written to an NDJ
    file and read back as an equal FeatureSet.
    """
    dftest = pd.DataFrame({
        "id": [1, 2],
        "score": ['yes', 'no'],
        "text": ["a b", "b c"]
    })
    dftest.set_index("id", inplace=True)
    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    test_dict_vectorizer = DictVectorizer()
    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
    fs_test = FeatureSet('test',
                         ids=dftest.index.values,
                         labels=dftest['score'].values,
                         features=Xtest,
                         vectorizer=test_dict_vectorizer)
    output_path = join(_my_dir, "other", "test_string_labels_df.jsonlines")
    test_writer = NDJWriter(output_path, fs_test)
    test_writer.write()

    # read in the written file into a featureset and confirm that the
    # two featuresets are equal
    fs_test2 = NDJReader.for_path(output_path, ids_to_floats=True).read()

    assert fs_test == fs_test2
def test_mismatch_labels_features(): """ Test to catch mistmatch between the shape of the labels vector and the feature matrix """ # get a 100 instances with 4 features but ignore the labels we # get from here X, y = make_classification(n_samples=100, n_features=4, n_informative=4, n_redundant=0, n_classes=3, random_state=1234567890) # double-stack y to ensure we don't match the number of feature rows y2 = np.hstack([y, y]) # convert the features into a list of dictionaries feature_names = ['f{}'.format(n) for n in range(1, 5)] features = [] for row in X: features.append(dict(zip(feature_names, row))) # get 100 ids ids = ['EXAMPLE_{}'.format(i) for i in range(100)] # This should raise a ValueError FeatureSet('test', ids, features=features, labels=y2)
def check_dummy_classifier_predict(model_args, train_labels, expected_output):
    """
    Train a DummyClassifier with the given keyword arguments and training
    labels, then check that its predictions on a fixed 10-example test set
    equal ``expected_output``.
    """
    # create hard-coded featuresets based with known labels
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=train_labels,
                          features=[{"feature": i} for i in range(20)])
    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{"feature": i} for i in range(20, 30)])

    # Ensure predictions are as expected for the given strategy
    learner = Learner('DummyClassifier', model_kwargs=model_args)
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(np.array_equal(expected_output, predictions), True)
def make_cv_folds_data(num_examples_per_fold=100,
                       num_folds=3,
                       use_feature_hashing=False):
    """
    Create data for pre-specified CV folds tests
    with or without feature hashing
    """
    num_total_examples = num_examples_per_fold * num_folds

    # create the numeric features and the binary labels
    # (labels are a deterministic alternating 0/1 sequence;
    # the labels from make_classification are discarded)
    X, _ = make_classification(n_samples=num_total_examples,
                               n_features=3,
                               n_informative=3,
                               n_redundant=0,
                               n_classes=2,
                               random_state=1234567890)
    y = np.array([0, 1] * int(num_total_examples / 2))

    # the folds mapping: the first num_examples_per_fold examples
    # are in fold 1 the second num_examples_per_fold are in
    # fold 2 and so on
    foldgen = ([str(i)] * num_examples_per_fold for i in range(num_folds))
    folds = list(itertools.chain(*foldgen))

    # now create the list of feature dictionaries
    # and add the binary features that depend on
    # the class and fold number
    feature_names = ['f{}'.format(i) for i in range(1, 4)]
    features = []
    for row, classid, foldnum in zip(X, y, folds):
        string_feature_name = 'is_{}_{}'.format(classid, foldnum)
        string_feature_value = 1
        feat_dict = dict(zip(feature_names, row))
        feat_dict.update({string_feature_name: string_feature_value})
        features.append(feat_dict)

    # create the example IDs
    ids = [
        'EXAMPLE_{}'.format(num_examples_per_fold * k + i)
        for k in range(num_folds) for i in range(num_examples_per_fold)
    ]

    # create the cross-validation feature set with or without feature hashing
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    cv_fs = FeatureSet('cv_folds',
                       ids,
                       features=features,
                       labels=y,
                       vectorizer=vectorizer)

    # make the custom cv folds dictionary
    custom_cv_folds = dict(zip(ids, folds))

    return (cv_fs, custom_cv_folds)
def test_feature_merging_order_invariance(): """ Test whether featuresets with different orders of IDs can be merged """ # First, randomly generate two feature sets and then make sure they have # the same labels. train_fs1, _, _ = make_regression_data() train_fs2, _, _ = make_regression_data(start_feature_num=3, random_state=87654321) train_fs2.labels = train_fs1.labels.copy() # make a reversed copy of feature set 2 shuffled_indices = list(range(len(train_fs2.ids))) np.random.seed(123456789) np.random.shuffle(shuffled_indices) train_fs2_ids_shuf = train_fs2.ids[shuffled_indices] train_fs2_labels_shuf = train_fs2.labels[shuffled_indices] train_fs2_features_shuf = train_fs2.features[shuffled_indices] train_fs2_shuf = FeatureSet("f2_shuf", train_fs2_ids_shuf, labels=train_fs2_labels_shuf, features=train_fs2_features_shuf, vectorizer=train_fs2.vectorizer) # merge feature set 1 with feature set 2 and its reversed version merged_fs = train_fs1 + train_fs2 merged_fs_shuf = train_fs1 + train_fs2_shuf # check that the two merged versions are the same feature_names = (train_fs1.vectorizer.get_feature_names() + train_fs2.vectorizer.get_feature_names()) assert_array_equal(merged_fs.vectorizer.get_feature_names(), feature_names) assert_array_equal(merged_fs_shuf.vectorizer.get_feature_names(), feature_names) assert_array_equal(merged_fs.labels, train_fs1.labels) assert_array_equal(merged_fs.labels, train_fs2.labels) assert_array_equal(merged_fs.labels, merged_fs_shuf.labels) assert_array_equal(merged_fs.ids, train_fs1.ids) assert_array_equal(merged_fs.ids, train_fs2.ids) assert_array_equal(merged_fs.ids, merged_fs_shuf.ids) assert_array_equal(merged_fs.features[:, 0:2].todense(), train_fs1.features.todense()) assert_array_equal(merged_fs.features[:, 2:4].todense(), train_fs2.features.todense()) assert_array_equal(merged_fs.features.todense(), merged_fs_shuf.features.todense()) assert not np.all( merged_fs.features[:, 0:2].todense() == merged_fs.features[:, 2:4].todense())
def make_class_map_data():
    """
    Create and write jsonlines train/test files for the class-map tests,
    with partially disjoint feature sets between train and test.
    """
    # Create training file
    train_path = join(_my_dir, 'train', 'test_class_map.jsonlines')
    ids = []
    labels = []
    features = []
    # 'cat' appears twice, so it occurs twice as often as each dog breed
    class_names = ['beagle', 'cat', 'dachsund', 'cat']
    for i in range(1, 101):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # note that f1 and f5 are missing in all instances but f4 is not
        x = {"f2": i + 1, "f3": i + 2, "f4": i + 5}
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    train_fs = FeatureSet('train_class_map',
                          ids,
                          features=features,
                          labels=labels)
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Create test file
    test_path = join(_my_dir, 'test', 'test_class_map.jsonlines')
    ids = []
    labels = []
    features = []
    for i in range(1, 51):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # f1 and f5 are not missing in any instances here but f4 is
        x = {"f1": i, "f2": i + 2, "f3": i % 10, "f5": i * 2}
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    test_fs = FeatureSet('test_class_map',
                         ids,
                         features=features,
                         labels=labels)
    writer = NDJWriter(test_path, test_fs)
    writer.write()
def read(self):
    """
    Read examples from a list of dictionaries.

    Returns
    -------
    feature_set : skll.FeatureSet
        FeatureSet representing the list of dictionaries we read in.

    Raises
    ------
    ValueError
        If ``ids_to_floats`` is True but an ID cannot be converted.
    """
    ids = []
    labels = []
    feat_dicts = []
    for example_num, example in enumerate(self.path_or_list):
        curr_id = str(example.get("id", "EXAMPLE_{}".format(example_num)))
        # FIX: the ID-to-float conversion used to be performed twice per
        # example (a second, redundant try/except followed below); the
        # duplicate has been removed — one conversion is sufficient.
        if self.ids_to_floats:
            try:
                curr_id = float(curr_id)
            except ValueError:
                raise ValueError(('You set ids_to_floats to true,' +
                                  ' but ID {} could not be ' +
                                  'converted to float in ' +
                                  '{}').format(curr_id, example))
        # the label is optional; map it through the class map if present
        class_name = (safe_float(example['y'], replace_dict=self.class_map)
                      if 'y' in example else None)
        example = example['x']

        # Update lists of IDs, labels, and feature dictionaries
        ids.append(curr_id)
        labels.append(class_name)
        feat_dicts.append(example)

        # Print out status
        if example_num % 100 == 0:
            self._print_progress(example_num)

    # Convert lists to numpy arrays
    ids = np.array(ids)
    labels = np.array(labels)
    features = self.vectorizer.fit_transform(feat_dicts)

    return FeatureSet('converted',
                      ids,
                      labels=labels,
                      features=features,
                      vectorizer=self.vectorizer)
def test_dummy_classifier_predict():
    """
    Check DummyClassifier predictions on a hard-coded dataset for the
    'stratified', 'most_frequent', and 'constant' strategies.
    """
    # hard-code dataset
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=([0] * 14) + ([1] * 6),
                          features=[{
                              "feature": i
                          } for i in range(20)])
    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{
                             "feature": i
                         } for i in range(20, 30)])

    # (model kwargs, expected prediction vector) pairs, one per strategy
    toy_data = ([{
        "strategy": "stratified",
        "random_state": 12345
    }, np.array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0])], [{
        "strategy": "most_frequent"
    }, np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])], [{
        "strategy": "constant",
        "constant": 1
    }, np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])])

    # Ensure predictions are correct for all strategies.
    correct = []
    for model_args, expected_output in toy_data:
        learner = Learner('DummyClassifier', model_kwargs=model_args)
        learner.train(train_fs)
        predictions = learner.predict(test_fs)
        correct.append(np.array_equal(expected_output, predictions))
    eq_(correct, [True, True, True])
def make_float_class_data():
    """
    We want to create data that has labels that look like
    floats to make sure they are preserved correctly

    Returns
    -------
    skll.FeatureSet
        75 examples (25 per float label 1.2/1.5/1.8), where each example
        fires exactly one of 25 binary features.
    """
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 76)]
    y = [1.2] * 25 + [1.5] * 25 + [1.8] * 25
    X = np.vstack([np.identity(25), np.identity(25), np.identity(25)])
    # FIX: one feature name per column of X. The original generated only
    # five names (range(1, 6)), so zip() silently dropped the remaining
    # 20 columns of each 25-column identity row.
    feature_names = ['f{}'.format(i) for i in range(1, 26)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    return FeatureSet('float-classes', ids, features=features, labels=y)
def test_learning_curve_implementation(): """ Test to ensure that the learning curve results match scikit-learn """ # This test is different from the other tests which just use regression data. # The reason is that we want this test to fail in case our implementation # diverges from the scikit-learn implementation. This test essentially # serves as a regression test as well. # Load in the digits data set digits = load_digits() X, y = digits.data, digits.target # get the learning curve results from scikit-learn for this data cv_folds = 10 random_state = 123456789 cv = ShuffleSplit(n_splits=cv_folds, test_size=0.2, random_state=random_state) estimator = MultinomialNB() train_sizes = np.linspace(.1, 1.0, 5) train_sizes1, train_scores1, test_scores1 = learning_curve(estimator, X, y, cv=cv, train_sizes=train_sizes, scoring='accuracy') # get the features from this data into a FeatureSet instance we can use # with the SKLL API feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])] features = [] for row in X: features.append(dict(zip(feature_names, row))) fs = FeatureSet('train', features=features, labels=y, ids=list(range(X.shape[0]))) # we don't want to filter out any features since scikit-learn # does not do that either learner = Learner('MultinomialNB', min_feature_count=0) (train_scores2, test_scores2, train_sizes2) = learner.learning_curve(fs, cv_folds=cv_folds, train_sizes=train_sizes, metric='accuracy') assert np.all(train_sizes1 == train_sizes2) assert np.allclose(train_scores1, train_scores2) assert np.allclose(test_scores1, test_scores2)
def make_rare_class_data():
    """
    Build a 15-example FeatureSet for rare-class tests: five instances per
    class for three labels (0, 1, 2), where each instance within a group of
    five fires exactly one of the five binary features.
    """
    num_labels = 3
    group_size = 5

    # IDs EXAMPLE_1 .. EXAMPLE_15
    ids = ['EXAMPLE_{}'.format(i)
           for i in range(1, num_labels * group_size + 1)]

    # labels [0]*5 + [1]*5 + [2]*5
    y = [label for label in range(num_labels) for _ in range(group_size)]

    # three stacked 5x5 identity matrices: one feature fires per row
    X = np.vstack([np.identity(group_size)] * num_labels)

    names = ['f{}'.format(i) for i in range(1, group_size + 1)]
    features = [dict(zip(names, row)) for row in X]

    return FeatureSet('rare-class', ids, features=features, labels=y)
def read(self):
    """
    Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
    `.ndj`, or `.tsv` formats.

    Returns
    -------
    feature_set : skll.FeatureSet
        ``FeatureSet`` instance representing the input file.

    Raises
    ------
    ValueError
        If ``ids_to_floats`` is True, but IDs cannot be converted.
    ValueError
        If no features are found.
    ValueError
        If the example IDs are not unique.
    """
    self.logger.debug('Path: %s', self.path_or_list)

    if not self.quiet:
        self._progress_msg = "Loading {}...".format(self.path_or_list)
        print(self._progress_msg, end="\r", file=sys.stderr)
        sys.stderr.flush()

    # delegate parsing to the pandas-based or the row-based reader
    if self._use_pandas:
        ids, labels, features = self._sub_read(self.path_or_list)
    else:
        ids, labels, features = self._sub_read_rows(self.path_or_list)

    # Convert everything to numpy arrays
    features = self.vectorizer.fit_transform(features)

    # Report that loading is complete
    self._print_progress("done", end="\n")

    # Make sure we have the same number of ids, labels, and features
    # NOTE(review): `assert` is stripped under `python -O`; consider an
    # explicit raise if this invariant must always be enforced
    assert ids.shape[0] == labels.shape[0] == features.shape[0]

    if ids.shape[0] != len(set(ids)):
        raise ValueError('The example IDs are not unique in %s.' %
                         self.path_or_list)

    return FeatureSet(self.path_or_list,
                      ids,
                      labels=labels,
                      features=features,
                      vectorizer=self.vectorizer)
def create_jsonlines_feature_files(path):
    """
    Create the five per-feature jsonlines files (``f0``-``f4``) used by
    the ablation tests under ``path``, unless they all already exist.
    """
    # we only need to create the feature files if they
    # don't already exist under the given path
    feature_files_to_create = [
        join(path, 'f{}.jsonlines'.format(i)) for i in range(5)
    ]

    if all([exists(ff) for ff in feature_files_to_create]):
        return
    else:
        num_examples = 1000
        np.random.seed(1234567890)

        # Create lists we will write files from
        ids = []
        features = []
        labels = []
        for j in range(num_examples):
            # alternate between the two classes
            y = "dog" if j % 2 == 0 else "cat"
            ex_id = "{}{}".format(y, j)
            # five random integer-valued features per example
            x = {
                "f{}".format(feat_num): np.random.randint(0, 4)
                for feat_num in range(5)
            }
            x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
            ids.append(ex_id)
            labels.append(y)
            features.append(x)

        # write one file per feature, each containing only that feature
        for i in range(5):
            file_path = join(path, 'f{}.jsonlines'.format(i))
            sub_features = []
            for example_num in range(num_examples):
                feat_num = i
                x = {
                    "f{}".format(feat_num):
                    features[example_num]["f{}".format(feat_num)]
                }
                sub_features.append(x)
            fs = FeatureSet('ablation_cv', ids,
                            features=sub_features, labels=labels)
            writer = NDJWriter(file_path, fs)
            writer.write()
def test_writing_ndj_featureset_with_string_ids():
    """
    Test that a FeatureSet with string IDs can be written to an NDJ file
    and read back as an equal FeatureSet.
    """
    test_dict_vectorizer = DictVectorizer()
    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
    fs_test = FeatureSet('test',
                         ids=['1', '2'],
                         labels=[1, 2],
                         features=Xtest,
                         vectorizer=test_dict_vectorizer)
    output_path = join(_my_dir, "other", "test_string_ids.jsonlines")
    test_writer = NDJWriter(output_path, fs_test)
    test_writer.write()

    # read in the written file into a featureset and confirm that the
    # two featuresets are equal
    fs_test2 = NDJReader.for_path(output_path).read()

    assert fs_test == fs_test2
def test_empty_ids():
    """
    Test to ensure that an error is raised if ids is None
    """
    # generate 100 examples with 4 informative features and 3 classes
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # turn each row into a {feature name: value} dictionary
    names = ['f{}'.format(idx) for idx in range(1, 5)]
    features = [dict(zip(names, row)) for row in X]

    # constructing a FeatureSet without IDs should raise a ValueError
    FeatureSet('test', None, features=features, labels=y)
def make_ablation_data():
    """
    Create the five per-feature jsonlines training files (``f0``-``f4``)
    used by the ablation tests, removing any stale ablation CV result
    files first.
    """
    # Remove old CV data
    for old_file in glob.glob(join(_my_dir, 'output',
                                   'ablation_cv_*.results')):
        os.remove(old_file)

    num_examples = 1000

    np.random.seed(1234567890)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        # alternate between the two classes
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        # five random integer-valued features per example
        x = {
            "f{}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(5)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # write one training file per feature
    for i in range(5):
        train_path = join(_my_dir, 'train', 'f{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i
            x = {
                "f{}".format(feat_num):
                features[example_num]["f{}".format(feat_num)]
            }
            sub_features.append(x)
        train_fs = FeatureSet('ablation_cv', ids,
                              features=sub_features, labels=labels)
        writer = NDJWriter(train_path, train_fs)
        writer.write()
def test_mismatch_ids_features(): """ Test to catch mistmatch between the shape of the ids vector and the feature matrix """ # get a 100 instances with 4 features each X, y = make_classification(n_samples=100, n_features=4, n_informative=4, n_redundant=0, n_classes=3, random_state=1234567890) # convert the features into a list of dictionaries feature_names = ['f{}'.format(n) for n in range(1, 5)] features = [] for row in X: features.append(dict(zip(feature_names, row))) # get 200 ids since we don't want to match the number of feature rows ids = ['EXAMPLE_{}'.format(i) for i in range(200)] # This should raise a ValueError FeatureSet('test', ids, features=features, labels=y)
def make_merging_data(num_feat_files, suffix, numeric_ids):
    """
    Create and write the featureset files used by the merging tests:
    one file per feature subset plus a single merged file containing
    all features.

    Parameters
    ----------
    num_feat_files : int
        Number of feature-subset files to write (17 features each).
    suffix : str
        Filename suffix (determines the output format via Writer.for_path).
    numeric_ids : bool
        If True, use plain integer example IDs instead of strings.
    """
    num_examples = 500
    num_feats_per_file = 17

    np.random.seed(1234567890)

    merge_dir = join(_my_dir, 'train', 'test_merging')
    if not exists(merge_dir):
        os.makedirs(merge_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        # alternate between the two classes
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j) if not numeric_ids else j
        # random integer features, enough for all subset files
        x = {
            "f{:03d}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Unmerged: one contiguous block of 17 features per subset
    subset_dict = {}
    for i in range(num_feat_files):
        feat_num = i * num_feats_per_file
        subset_dict['{}'.format(i)] = [
            "f{:03d}".format(feat_num + j)
            for j in range(num_feats_per_file)
        ]
    train_path = join(merge_dir, suffix)
    train_fs = FeatureSet('train', ids, labels=labels, features=features)
    Writer.for_path(train_path, train_fs, subsets=subset_dict).write()

    # Merged: all features in one file
    train_path = join(merge_dir, 'all{}'.format(suffix))
    Writer.for_path(train_path, train_fs).write()
def make_regression_data(num_examples=100,
                         train_test_ratio=0.5,
                         num_features=2,
                         sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):
    """
    Create train/test regression FeatureSets via sklearn's
    ``make_regression()``, optionally with feature hashing.

    Returns
    -------
    tuple
        ``(train_fs, test_fs, weightdict)`` where ``weightdict`` maps
        feature names (hashed names when hashing) to coefficients.
    """
    # if we are doing feature hashing and we have asked for more
    # feature bins than number of total features, we need to
    # handle that because `make_regression()` doesn't know
    # about hashing
    if use_feature_hashing and num_features < feature_bins:
        num_features = feature_bins

    # use sklearn's make_regression to generate the data for us
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # create a list of dictionaries as the features
    # (feature indices are zero-padded so names sort lexicographically)
    index_width_for_feature_name = int(floor(log10(num_features))) + 1
    feature_names = []
    for n in range(start_feature_num, start_feature_num + num_features):
        index_str = str(n).zfill(index_width_for_feature_name)
        feature_name = 'f{}'.format(index_str)
        feature_names.append(feature_name)
    features = [dict(zip(feature_names, row)) for row in X]

    # At this point the labels are generated using unhashed features
    # even if we want to do feature hashing. `make_regression()` from
    # sklearn doesn't know anything about feature hashing, so we need
    # a hack here to compute the updated labels ourselves
    # using the same command that sklearn uses inside `make_regression()`
    # which is to generate the X and the weights and then compute the
    # y as the dot product of the two. This y will then be used as our
    # labels instead of the original y we got from `make_regression()`.
    # Note that we only want to use the number of weights that are
    # equal to the number of feature bins for the hashing
    if use_feature_hashing:
        feature_hasher = FeatureHasher(n_features=feature_bins)
        hashed_X = feature_hasher.fit_transform(features)
        y = hashed_X.dot(weights[:feature_bins])

    # convert the weights array into a dictionary for convenience
    # if we are using feature hashing, we need to use the names
    # that would be output by `model_params()` instead of the
    # original names since that's what we would get from SKLL
    if use_feature_hashing:
        index_width_for_feature_name = int(floor(log10(feature_bins))) + 1
        hashed_feature_names = []
        for i in range(feature_bins):
            index_str = str(i + 1).zfill(index_width_for_feature_name)
            feature_name = 'hashed_feature_{}'.format(index_str)
            hashed_feature_names.append(feature_name)
        weightdict = dict(zip(hashed_feature_names,
                              weights[:feature_bins]))
    else:
        weightdict = dict(zip(feature_names, weights))

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hashing else None)
    train_fs = FeatureSet('regression_train',
                          train_ids,
                          labels=train_y,
                          features=train_features,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test',
                         test_ids,
                         labels=test_y,
                         features=test_features,
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
def make_classification_data(num_examples=100, train_test_ratio=0.5,
                             num_features=10, use_feature_hashing=False,
                             feature_bins=4, num_labels=2,
                             empty_labels=False, string_label_list=None,
                             feature_prefix='f', id_type='string',
                             class_weights=None, non_negative=False,
                             one_string_feature=False, num_string_values=4,
                             random_state=1234567890):
    """
    Create train/test ``FeatureSet`` instances containing synthetic
    classification data generated via
    ``sklearn.datasets.make_classification()``.

    Returns a 2-tuple ``(train_fs, test_fs)``; ``test_fs`` is ``None``
    when ``train_test_ratio`` >= 1.0. When ``empty_labels`` is True the
    featuresets carry no labels; ``string_label_list`` replaces the
    integer classes with the given strings; ``one_string_feature``
    appends one categorical (string-valued) feature.
    """
    # one feature slot is reserved for the optional string-valued feature
    num_numeric_features = (num_features - 1 if one_string_feature
                            else num_features)
    X, y = make_classification(n_samples=num_examples,
                               n_features=num_numeric_features,
                               n_informative=num_numeric_features,
                               n_redundant=0,
                               n_classes=num_labels,
                               weights=class_weights,
                               random_state=random_state)

    # optionally map the integer class labels to the given strings
    if string_label_list:
        assert (len(string_label_list) == num_labels)
        label_to_string = np.vectorize(lambda n: string_label_list[n])
        y = label_to_string(y)

    # if we were told to only generate non-negative features, then
    # we can simply take the absolute values of the generated features
    if non_negative:
        X = abs(X)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs; they can be strings, numeric strings,
    # floats, or integers
    if id_type == 'string':
        ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]
    elif id_type == 'integer_string':
        ids = ['{}'.format(n) for n in range(1, num_examples + 1)]
    elif id_type == 'float':
        ids = [float(n) for n in range(1, num_examples + 1)]
    elif id_type == 'integer':
        ids = list(range(1, num_examples + 1))
    else:
        # fail fast with a clear message instead of an
        # UnboundLocalError further down
        raise ValueError('Unknown value for id_type: {}'.format(id_type))

    # create a string feature whose values are drawn from
    # 'a', 'b', 'c', ... and append it to X as the last column
    if one_string_feature:
        prng = RandomState(random_state)
        # `randint(0, k)` draws the same stream as the deprecated
        # `random_integers(0, k - 1)` that was used previously
        random_indices = prng.randint(0, num_string_values, num_examples)
        possible_values = [chr(x) for x in range(97, 97 + num_string_values)]
        string_feature_values = [possible_values[i] for i in random_indices]
        # bug fix: reshape to `num_examples`, not a hard-coded 100,
        # so this works for any requested number of examples
        string_feature_column = np.array(string_feature_values,
                                         dtype=object).reshape(num_examples, 1)
        X = np.append(X, string_feature_column, 1)

    # create a list of dictionaries as the features
    feature_names = ['{}{:02d}'.format(feature_prefix, n)
                     for n in range(1, num_features + 1)]
    features = [dict(zip(feature_names, row)) for row in X]

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # are we told to generate empty labels
    train_labels = None if empty_labels else train_y
    test_labels = None if empty_labels else test_y

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hashing else None)
    train_fs = FeatureSet('classification_train', train_ids,
                          labels=train_labels, features=train_features,
                          vectorizer=vectorizer)
    if train_test_ratio < 1.0:
        test_fs = FeatureSet('classification_test', test_ids,
                             labels=test_labels, features=test_features,
                             vectorizer=vectorizer)
    else:
        test_fs = None

    return (train_fs, test_fs)
def test_pipeline_attribute():
    """
    Nose-style test generator: yields one ``check_pipeline_attribute``
    call per combination of learner, feature hashing, minimum feature
    count, feature scaling, and sampler.
    """
    # define the classifier and regressor feature dictionaries and labels
    # that we will test on, and also the classes and targets respectively
    cfeature_dicts = [{"f01": -2.87, "f02": 0.713, "f03": 2.86, "f04": 0.385,
                       "f05": -0.989, "f06": 0.380, "f07": -0.365,
                       "f08": -0.224, "f09": 3.45, "f10": 0.622},
                      {"f01": 0.058, "f02": -1.14, "f03": 2.85, "f04": 1.41,
                       "f05": 1.60, "f06": 1.04, "f07": -0.669,
                       "f08": -0.727, "f09": 1.82, "f10": 1.336},
                      {"f01": -1.80, "f02": 3.21, "f03": 0.79, "f04": -0.55,
                       "f05": 0.059, "f06": -5.66, "f07": -3.08,
                       "f08": -0.95, "f09": 0.188, "f10": -1.24},
                      {"f01": 2.270, "f02": 2.271, "f03": 2.285, "f04": 2.951,
                       "f05": 1.018, "f06": -0.59, "f07": 0.432,
                       "f08": 1.614, "f09": -0.69, "f10": -1.27},
                      {"f01": 2.98, "f02": 3.74, "f03": 1.96, "f04": 0.80,
                       "f05": 0.425, "f06": -0.76, "f07": 4.013,
                       "f08": 3.119, "f09": 2.104, "f10": 0.195},
                      {"f01": 2.560, "f02": -2.05, "f03": 1.793, "f04": 0.955,
                       "f05": 2.914, "f06": 2.239, "f07": -1.41,
                       "f08": -1.24, "f09": -4.44, "f10": 0.273},
                      {"f01": 1.86, "f02": -0.017, "f03": 1.337, "f04": -2.14,
                       "f05": 2.255, "f06": -1.21, "f07": -0.24,
                       "f08": -0.66, "f09": -2.51, "f10": -1.06},
                      {"f01": -1.95, "f02": -1.81, "f03": 2.105, "f04": 0.976,
                       "f05": -1.480, "f06": 1.120, "f07": -1.22,
                       "f08": 0.704, "f09": -3.66, "f10": -1.72},
                      {"f01": -1.54, "f02": -2.17, "f03": -4.18, "f04": 1.708,
                       "f05": 0.514, "f06": 0.354, "f07": -3.55,
                       "f08": 2.285, "f09": -3.47, "f10": -0.79},
                      {"f01": 2.162, "f02": -0.71, "f03": -0.448, "f04": 0.326,
                       "f05": 3.384, "f06": -0.455, "f07": 1.253,
                       "f08": 0.998, "f09": 3.193, "f10": 1.342}]
    classes = [1, 1, 0, 2, 1, 2, 0, 1, 2, 1]
    rfeature_dicts = [{'f1': 1.351, 'f2': -0.117, 'f3': 0.570, 'f4': 0.0619,
                       'f5': 1.569, 'f6': 0.805},
                      {'f1': -0.557, 'f2': -1.704, 'f3': 0.0913, 'f4': 0.767,
                       'f5': 1.281, 'f6': -0.803},
                      {'f1': 0.720, 'f2': -0.268, 'f3': 0.760, 'f4': 0.861,
                       'f5': -0.403, 'f6': 0.814},
                      {'f1': 1.737, 'f2': -0.228, 'f3': 1.340, 'f4': 2.031,
                       'f5': 2.170, 'f6': 1.498},
                      {'f1': 0.344, 'f2': 0.340, 'f3': 0.572, 'f4': -1.06,
                       'f5': 1.044, 'f6': 2.065},
                      {'f1': -0.489, 'f2': -0.420, 'f3': 0.428, 'f4': 0.707,
                       'f5': -1.306, 'f6': 0.0081},
                      {'f1': 0.805, 'f2': 0.570, 'f3': 1.351, 'f4': -0.117,
                       'f5': 0.0619, 'f6': 1.569},
                      {'f1': -1.083, 'f2': 0.0369, 'f3': -0.413, 'f4': 1.391,
                       'f5': 1.417, 'f6': -1.118},
                      {'f1': -1.945, 'f2': -0.332, 'f3': -1.393, 'f4': 0.952,
                       'f5': -0.816, 'f6': 1.417},
                      {'f1': 1.976, 'f2': -0.220, 'f3': -1.636, 'f4': 0.795,
                       'f5': -2.34, 'f6': -0.148}]
    targets = [96.057, -176.017, -182.32, -56.46, -50.14, -84.53, 241.71,
               -17.84, -47.09, 77.65]

    # create training featuresets that we will use to train our estimator
    function_args_dict = defaultdict(dict)
    for estimator_type in ['classifier', 'regressor']:
        for do_feature_hashing in [True, False]:
            if estimator_type == 'classifier':
                (train_fs,
                 test_fs) = make_classification_data(num_examples=500,
                                                     num_features=10,
                                                     num_labels=3,
                                                     feature_bins=4,
                                                     non_negative=True,
                                                     use_feature_hashing=do_feature_hashing)
                labels = classes
                feature_dicts = cfeature_dicts
            else:
                (train_fs,
                 test_fs,
                 _) = make_regression_data(num_examples=500,
                                           num_features=6,
                                           feature_bins=4,
                                           use_feature_hashing=do_feature_hashing)
                labels = targets
                feature_dicts = rfeature_dicts

            # if we are doing feature hashing, we need to transform our test
            # cases to the same space. If we are not, then we don't need to
            # worry because we have manually ensured that the number of
            # features are the same for the non-hashing case (10 for
            # classification, and 6 for regression)
            test_fs = FeatureSet('test',
                                 ids=list(range(1, 11)),
                                 features=feature_dicts,
                                 labels=labels,
                                 vectorizer=train_fs.vectorizer if do_feature_hashing else None)
            function_args_dict[estimator_type][do_feature_hashing] = [train_fs,
                                                                      test_fs,
                                                                      feature_dicts,
                                                                      labels]
    function_args_dict = dict(function_args_dict)

    # now set up the test cases: every learner crossed with every
    # hashing / min-count / scaling / sampler combination
    learners = ['LinearSVC', 'LogisticRegression', 'MultinomialNB', 'SVC',
                'GradientBoostingClassifier', 'Lars', 'LinearSVR', 'Ridge',
                'SVR', 'GradientBoostingRegressor']
    use_hashing = [True, False]
    min_feature_counts = [1, 2]
    samplers = [None, 'RBFSampler', 'SkewedChi2Sampler']
    scalers = ['none', 'with_mean', 'with_std', 'both']
    for (learner_name,
         do_feature_hashing,
         min_count,
         scaling_type,
         sampler_name) in product(learners,
                                  use_hashing,
                                  min_feature_counts,
                                  scalers,
                                  samplers):
        # skip the case for MultinomialNB with feature hashing
        # or feature sampling since it does not support those
        if learner_name == 'MultinomialNB':
            if do_feature_hashing or sampler_name is not None:
                continue

        # if we are using a SkewedChi2Sampler, we need to set
        # some parameters to make sure it works as expected
        if sampler_name == 'SkewedChi2Sampler':
            sampler_kwargs = {'skewedness': 15, 'n_components': 10}
        else:
            sampler_kwargs = {}

        # create a learner instance with the given parameters
        # and with pipeline attribute set to True
        learner = Learner(learner_name,
                          min_feature_count=min_count,
                          sampler=sampler_name,
                          sampler_kwargs=sampler_kwargs,
                          feature_scaling=scaling_type,
                          pipeline=True)
        yield (check_pipeline_attribute,
               learner_name,
               do_feature_hashing,
               min_count,
               scaling_type,
               sampler_name,
               learner,
               function_args_dict)
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    """
    Create and write out featureset files for testing file-format
    conversion: several unmerged files in the ``from_suffix`` format plus
    one merged file in the ``to_suffix`` format.
    """
    # NOTE(review): this definition is dead code — it is shadowed by the
    # later re-definition of `make_conversion_data` (the one that takes a
    # `with_labels` keyword) further down in this file, so only that later
    # version is ever callable. Consider deleting this copy.
    num_examples = 500
    num_feats_per_file = 7
    np.random.seed(1234567890)
    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from: alternating dog/cat labels
    # with random integer feature values in [0, 4)
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for
             feat_num in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    label_map = {label: num for num, label in
                 enumerate(sorted({label for label in labels if
                                   not isinstance(label, (int, float))}))}
    # Add fake item to vectorizer for None
    label_map[None] = '00000'

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format,
    # one file per block of `num_feats_per_file` consecutive features
    for i in range(num_feat_files):
        train_path = join(convert_dir, '{}_{}{}'.format(feature_name_prefix,
                                                        i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)] for j in
                 range(num_feats_per_file)}
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        # libsvm needs the explicit label map; other formats do not
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs,
                            label_map=label_map).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
                                                     to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)
    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs,
                        label_map=label_map).write()
    else:
        Writer.for_path(train_path, train_fs).write()
def make_conversion_data(num_feat_files, from_suffix, to_suffix,
                         with_labels=True):
    """
    Create and write out featureset files for testing file-format
    conversion: several unmerged files in the ``from_suffix`` format plus
    one merged file in the ``to_suffix`` format. When ``with_labels`` is
    False, unlabeled variants are written with ``_unlabeled`` in the
    file names.
    """
    num_examples = 500
    num_feats_per_file = 7
    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # if we are not using labels, we do not want zero-valued features
    # because it may be the case that some subset of features end up
    # being all 0 and if this subset ends up being written out to a file
    # below, then for some formats (e.g., megam) nothing will get written
    # out which can cause issues when reading this file
    min_feature_value = 0 if with_labels else 1

    # build the ID / label / feature lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ids.append("{}{}".format(y, j))
        if with_labels:
            labels.append(y)
        row = {"f{:03d}".format(feat_num):
               np.random.randint(min_feature_value, 4 + min_feature_value)
               for feat_num in range(num_feat_files * num_feats_per_file)}
        features.append(OrderedDict(sorted(row.items(), key=lambda t: t[0])))

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    if with_labels:
        string_labels = sorted({label for label in labels
                                if not isinstance(label, (int, float))})
        label_map = {label: num for num, label in enumerate(string_labels)}
        # Add fake item to vectorizer for None
        label_map[None] = '00000'
    else:
        label_map = None

    def write_out(path, fs, suffix):
        # pick the Writer invocation appropriate for the target format
        if suffix == '.libsvm':
            Writer.for_path(path, fs, label_map=label_map).write()
        elif suffix in ['.arff', '.csv', '.tsv']:
            Writer.for_path(path, fs,
                            label_col='y' if with_labels else None).write()
        else:
            Writer.for_path(path, fs).write()

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'

    # Write out unmerged features in the `from_suffix` file format,
    # one file per block of `num_feats_per_file` consecutive features
    for i in range(num_feat_files):
        first_feat = i * num_feats_per_file
        sub_features = [{"f{:03d}".format(first_feat + j):
                         row["f{:03d}".format(first_feat + j)]
                         for j in range(num_feats_per_file)}
                        for row in features]
        sub_path = join(convert_dir,
                        '{}_{}{}{}'.format(feature_name_prefix, i,
                                           with_labels_part, from_suffix))
        sub_fs = FeatureSet('sub_train', ids, labels=labels,
                            features=sub_features,
                            vectorizer=feat_vectorizer)
        write_out(sub_path, sub_fs, from_suffix)

    # Write out the merged features in the `to_suffix` file format
    merged_path = join(convert_dir,
                       '{}{}_all{}'.format(feature_name_prefix,
                                           with_labels_part, to_suffix))
    merged_fs = FeatureSet('train', ids, labels=labels, features=features,
                           vectorizer=feat_vectorizer)

    # we need to do this to get around the FeatureSet using NaNs
    # instead of None when there are no labels which causes problems
    # later when comparing featuresets
    if not with_labels:
        merged_fs.labels = [None] * len(merged_fs.labels)

    write_out(merged_path, merged_fs, to_suffix)
def read(self):
    """
    Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
    `.megam`, `.ndj`, or `.tsv` formats.

    Returns
    -------
    feature_set : skll.FeatureSet
        ``FeatureSet`` instance representing the input file.

    Raises
    ------
    ValueError
        If ``ids_to_floats`` is True, but IDs cannot be converted.
    ValueError
        If no features are found.
    ValueError
        If the example IDs are not unique.
    """
    self.logger.debug('Path: %s', self.path_or_list)

    if not self.quiet:
        self._progress_msg = "Loading {}...".format(self.path_or_list)
        print(self._progress_msg, end="\r", file=sys.stderr)
        sys.stderr.flush()

    # First pass over the file: collect labels and IDs only (the
    # feature dicts are re-read in a second pass below)
    ids = []
    labels = []
    ex_num = 0
    with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
        for ex_num, (id_, class_, _) in enumerate(self._sub_read(f), start=1):

            # Update lists of IDs and classes
            if self.ids_to_floats:
                try:
                    id_ = float(id_)
                except ValueError:
                    raise ValueError(('You set ids_to_floats to true,'
                                      ' but ID {} could not be '
                                      'converted to float in '
                                      '{}').format(id_,
                                                   self.path_or_list))
            ids.append(id_)
            labels.append(class_)
            if ex_num % 100 == 0:
                self._print_progress(ex_num)
        self._print_progress(ex_num)

    # Remember total number of examples for percentage progress meter
    total = ex_num
    if total == 0:
        raise ValueError("No features found in possibly "
                         "empty file '{}'.".format(self.path_or_list))

    # Convert everything to numpy arrays
    ids = np.array(ids)
    labels = np.array(labels)

    def feat_dict_generator():
        # Second pass: re-open and re-parse the file, yielding only the
        # feature dicts — presumably so the vectorizer can consume them
        # as a stream instead of holding all dicts in memory at once
        # (TODO confirm)
        with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
            for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)):
                yield feat_dict
                if ex_num % 100 == 0:
                    self._print_progress('{:.8}%'.format(
                        100 * ((ex_num / total))))
            self._print_progress("100%")

    # Vectorize the streamed feature dicts into a feature matrix
    features = self.vectorizer.fit_transform(feat_dict_generator())

    # Report that loading is complete
    self._print_progress("done", end="\n")

    # Make sure we have the same number of ids, labels, and features
    assert ids.shape[0] == labels.shape[0] == features.shape[0]

    if ids.shape[0] != len(set(ids)):
        raise ValueError('The example IDs are not unique in %s.' %
                         self.path_or_list)

    return FeatureSet(self.path_or_list, ids, labels=labels,
                      features=features, vectorizer=self.vectorizer)