Пример #1
0
def test_ids_to_floats():
    '''
    Check that example IDs are floats when ``ids_to_floats=True`` is passed
    to ``load_examples`` and strings when it is left at its default.
    '''
    input_file = os.path.join(_my_dir, 'train',
                              'test_input_2examples_1.jsonlines')

    # With conversion requested, the first ID must be a float.
    converted = load_examples(input_file, ids_to_floats=True, quiet=True)
    first_id = converted.ids[0]
    assert isinstance(first_id, float)

    # Without it, the first ID must be a string (and therefore not a float).
    unconverted = load_examples(input_file, quiet=True)
    first_id = unconverted.ids[0]
    assert not isinstance(first_id, float)
    assert isinstance(first_id, str)
Пример #2
0
def test_backward_compatibility():
    '''
    Verify that a model from v0.9.17 can still be loaded and generate the same
    predictions.

    The stored model file is selected by the major version of the running
    Python interpreter.  The predictions file is read as one float per line
    and compared against column 1 of the learner's prediction output
    (presumably the score for the second class -- confirm against
    ``Learner.predict``).
    '''
    compat_dir = os.path.join(_my_dir, 'backward_compatibility')
    predict_path = os.path.join(compat_dir,
                                'v0.9.17_test_summary_test_summary_LogisticRegression.predictions')
    model_path = os.path.join(compat_dir,
                              'v0.9.17_test_summary_test_summary_LogisticRegression.{}.model'.format(sys.version_info[0]))
    test_path = os.path.join(compat_dir, 'v0.9.17_test_summary.jsonlines')

    learner = Learner.from_file(model_path)
    examples = load_examples(test_path, quiet=True)
    new_predictions = learner.predict(examples)[:, 1]

    # Read all reference values up front; blank lines (e.g. a trailing
    # newline at the end of the file) carry no prediction and are ignored.
    with open(predict_path) as predict_file:
        old_predictions = [float(line) for line in predict_file
                           if line.strip()]

    # zip() stops at the shorter input, so a missing or extra prediction
    # would otherwise go unnoticed.  Compare the counts explicitly first.
    assert len(old_predictions) == len(new_predictions)

    for old_val, new_val in zip(old_predictions, new_predictions):
        assert_almost_equal(old_val, new_val)
def compute_eval_from_predictions(examples_file, predictions_file,
                                  metric_names):
    '''
    Compute evaluation metrics from prediction files after you have run an
    experiment.

    :param examples_file: a SKLL examples file (in .jsonlines or other format)
    :param predictions_file: a SKLL predictions output TSV file with id
                             and prediction column names
    :param metric_names: a list of SKLL metric names
                         (e.g., [pearson, unweighted_kappa])

    :returns: a dictionary from metrics names to values
    :raises ValueError: if the set of IDs in the examples file differs from
                        the set of IDs in the predictions file, or if a
                        prediction cannot be converted to a float.
    '''

    # read gold standard labels
    data = load_examples(examples_file)
    gold = dict(zip(data.ids, data.classes))

    # read predictions; the first column is the ID, the second the prediction
    pred = {}
    with open(predictions_file) as pred_file:
        reader = csv.reader(pred_file, dialect=csv.excel_tab)
        next(reader)  # skip header
        for row in reader:
            # csv.reader yields an empty list for a blank line (e.g. a
            # trailing newline); there is nothing to record for it.
            if not row:
                continue
            pred[row[0]] = float(row[1])

    # make a sorted list of example ids in order to match up
    # labels and predictions
    if set(gold) != set(pred):
        raise ValueError('The example and prediction IDs do not match.')
    example_ids = sorted(gold)

    # The aligned label lists do not depend on the metric, so build them
    # once rather than once per metric.
    gold_labels = [gold[ex_id] for ex_id in example_ids]
    pred_labels = [pred[ex_id] for ex_id in example_ids]

    res = {}
    for metric_name in metric_names:
        res[metric_name] = use_score_func(metric_name, gold_labels,
                                          pred_labels)
    return res
Пример #4
0
def compute_eval_from_predictions(examples_file, predictions_file,
                                  metric_names):
    '''
    Compute evaluation metrics from prediction files after you have run an
    experiment.

    :param examples_file: a SKLL examples file (in .jsonlines or other format)
    :param predictions_file: a SKLL predictions output TSV file with id
                             and prediction column names
    :param metric_names: a list of SKLL metric names
                         (e.g., [pearson, unweighted_kappa])

    :returns: a dictionary from metrics names to values
    :raises ValueError: if the set of IDs in the examples file differs from
                        the set of IDs in the predictions file, or if a
                        prediction cannot be converted to a float.
    '''

    # read gold standard labels
    data = load_examples(examples_file)
    gold = dict(zip(data.ids, data.classes))

    # read predictions; the first column is the ID, the second the prediction
    pred = {}
    with open(predictions_file) as pred_file:
        reader = csv.reader(pred_file, dialect=csv.excel_tab)
        next(reader)  # skip header
        for row in reader:
            # csv.reader yields an empty list for a blank line (e.g. a
            # trailing newline); there is nothing to record for it.
            if not row:
                continue
            pred[row[0]] = float(row[1])

    # make a sorted list of example ids in order to match up
    # labels and predictions
    if set(gold) != set(pred):
        raise ValueError('The example and prediction IDs do not match.')
    example_ids = sorted(gold)

    # The aligned label lists do not depend on the metric, so build them
    # once rather than once per metric.
    gold_labels = [gold[ex_id] for ex_id in example_ids]
    pred_labels = [pred[ex_id] for ex_id in example_ids]

    res = {}
    for metric_name in metric_names:
        res[metric_name] = use_score_func(metric_name, gold_labels,
                                          pred_labels)
    return res
Пример #5
0
def _load_featureset(dir_path, feat_files, suffix, label_col='y',
                     ids_to_floats=False, quiet=False, class_map=None,
                     feature_hasher=False, num_features=None):
    '''
    Load every feature file of a featureset and merge them into one set.

    Each prefix in ``feat_files`` is joined with ``suffix`` and placed under
    ``dir_path``; the resulting paths are loaded in sorted order and
    accumulated with ``+=`` onto an initially empty ``FeatureSet``.

    :param dir_path: Path to the directory that contains the feature files.
    :type dir_path: str
    :param feat_files: List of feature file prefixes
    :type feat_files: list of str
    :param suffix: Suffix to add to feature file prefixes to get full
                   filenames.
    :type suffix: str
    :param label_col: Name of the column which contains the class labels.
                      If no column with that name exists, or `None` is
                      specified, the data is considered to be unlabelled.
    :type label_col: str
    :param ids_to_floats: Convert IDs to float to save memory. Will raise error
                          if we encounter a non-numeric ID.
    :type ids_to_floats: bool
    :param quiet: Do not print "Loading..." status message to stderr.
    :type quiet: bool
    :param class_map: Mapping from original class labels to new ones. This is
                      mainly used for collapsing multiple classes into a single
                      class. Anything not in the mapping will be kept the same.
    :type class_map: dict from str to str
    :param feature_hasher: Passed through unchanged to ``load_examples``.
    :param num_features: Passed through unchanged to ``load_examples``.

    :returns: The merged featureset built from all of the given files.
    :rtype: FeatureSet
    '''
    # Build the full paths first, then order them so the merge order does
    # not depend on the order of the prefixes that were passed in.
    paths = [os.path.join(dir_path, prefix + suffix) for prefix in feat_files]
    paths.sort()

    # Options shared by every load_examples call.
    load_options = dict(label_col=label_col,
                        ids_to_floats=ids_to_floats,
                        quiet=quiet,
                        class_map=class_map,
                        feature_hasher=feature_hasher,
                        num_features=num_features)

    merged_set = FeatureSet('')
    for path in paths:
        merged_set += load_examples(path, **load_options)
    return merged_set
Пример #6
0
def _load_featureset(dirpath,
                     featureset,
                     suffix,
                     label_col='y',
                     ids_to_floats=False,
                     quiet=False,
                     class_map=None,
                     unlabelled=False):
    '''
    Load a list of feature files and merge them.

    :param dirpath: Path to the directory that contains the feature files.
    :type dirpath: str
    :param featureset: List of feature file prefixes
    :type featureset: list of str
    :param suffix: Suffix to add to feature file prefixes to get full filenames.
    :type suffix: str
    :param label_col: Name of the column which contains the class labels.
                      If no column with that name exists, or `None` is
                      specified, the data is considered to be unlabelled.
    :type label_col: str
    :param ids_to_floats: Convert IDs to float to save memory. Will raise error
                          if we encounter a non-numeric ID.
    :type ids_to_floats: bool
    :param quiet: Do not print "Loading..." status message to stderr.
    :type quiet: bool
    :param class_map: Mapping from original class labels to new ones. This is
                      mainly used for collapsing multiple classes into a single
                      class. Anything not in the mapping will be kept the same.
    :type class_map: dict from str to str
    :param unlabelled: Is this test data we're loading? If so, don't raise an
                       error if there are no labels.
    :type unlabelled: bool

    :returns: The merged IDs, classes, features, and feature vectorizer
              representing the given featureset.
    :rtype: ExamplesTuple
    :raises ValueError: if IDs are duplicated within a file, if the files do
                        not share the same IDs in the same order, if two
                        files give different labels for one ID, if two files
                        share a feature name, or if no file has labels and
                        ``unlabelled`` is false.
    '''

    # Load one ExamplesTuple per feature file, in sorted file-name order.
    file_names = sorted(
        os.path.join(dirpath, featfile + suffix) for featfile in featureset)
    example_tuples = [
        load_examples(file_name,
                      label_col=label_col,
                      ids_to_floats=ids_to_floats,
                      quiet=quiet,
                      class_map=class_map) for file_name in file_names
    ]

    # Check that the IDs are unique within each file.
    for file_name, examples in zip(file_names, example_tuples):
        ex_ids = examples.ids
        if len(ex_ids) != len(set(ex_ids)):
            raise ValueError(('The example IDs are not unique in ' +
                              '{}.').format(file_name))

    # Check that the different feature files have the same IDs.
    # To do this, make a sorted tuple of unique IDs for each feature file,
    # and then make sure they are all the same by making sure the set has one
    # item in it.
    # NOTE: mismatch_num is the number of distinct ID sets found, which is
    # what gets interpolated into the message below.
    mismatch_num = len(
        {tuple(sorted(examples.ids))
         for examples in example_tuples})
    if mismatch_num != 1:
        raise ValueError(('The sets of example IDs in {} feature files do ' +
                          'not match').format(mismatch_num))

    # Make sure there is a unique label for every example (or no label, for
    # "unseen" examples).
    # To do this, find the unique (id, y) tuples across every file that has
    # at least one label, and then make sure that all those ids are unique.
    unique_tuples = {
        (curr_id, curr_label)
        for examples in example_tuples
        if any(x is not None for x in examples.classes)
        for curr_id, curr_label in zip(examples.ids, examples.classes)
    }
    if len({tup[0] for tup in unique_tuples}) != len(unique_tuples):
        raise ValueError('At least two feature files have different labels ' +
                         '(i.e., y values) for the same ID.')

    # Now, create the final ExamplesTuple of examples with merged features
    merged_vectorizer = None
    merged_features = None
    merged_ids = None
    merged_classes = None
    for ids, classes, features, feat_vectorizer in example_tuples:
        # Combine feature matrices and vectorizers
        if merged_features is not None:
            # Check for duplicate feature names
            if (set(merged_vectorizer.get_feature_names())
                    & set(feat_vectorizer.get_feature_names())):
                raise ValueError('Two feature files have the same feature!')

            # Columns of this file are appended to the right of the merged
            # matrix, so its vocabulary indices shift by the current width.
            num_merged = merged_features.shape[1]
            merged_features = sp.hstack([merged_features, features], 'csr')

            # dictvectorizer sorts the vocabularies within each file
            for feat_name, index in sorted(feat_vectorizer.vocabulary_.items(),
                                           key=lambda x: x[1]):
                merged_vectorizer.vocabulary_[feat_name] = index + num_merged
                merged_vectorizer.feature_names_.append(feat_name)
        else:
            # The first file's vectorizer is reused (and extended in place)
            # as the merged vectorizer.
            merged_features = features
            merged_vectorizer = feat_vectorizer

        # IDs should be the same for each ExamplesTuple, so only store once
        if merged_ids is None:
            merged_ids = ids
        # Check that IDs are in the same order
        elif not np.all(merged_ids == ids):
            raise ValueError('IDs are not in the same order in each feature ' +
                             'file!')

        # If current ExamplesTuple has labels, check that they don't conflict
        if any(x is not None for x in classes):
            # Classes should be the same for each ExamplesTuple, so store once
            if merged_classes is None:
                merged_classes = classes
            # Check that classes don't conflict, when specified
            elif not np.all(merged_classes == classes):
                raise ValueError('Feature files have conflicting labels for ' +
                                 'examples with the same ID!')

    # Ensure that at least one file had classes if we're expecting them
    if merged_classes is None and not unlabelled:
        raise ValueError('No feature files in feature set contain class ' +
                         'labels!')

    return ExamplesTuple(merged_ids, merged_classes, merged_features,
                         merged_vectorizer)
Пример #7
0
def _load_featureset(dirpath, featureset, suffix, label_col='y',
                     ids_to_floats=False, quiet=False, class_map=None,
                     unlabelled=False):
    '''
    Load a list of feature files and merge them.

    :param dirpath: Path to the directory that contains the feature files.
    :type dirpath: str
    :param featureset: List of feature file prefixes
    :type featureset: list of str
    :param suffix: Suffix to add to feature file prefixes to get full filenames.
    :type suffix: str
    :param label_col: Name of the column which contains the class labels.
                      If no column with that name exists, or `None` is
                      specified, the data is considered to be unlabelled.
    :type label_col: str
    :param ids_to_floats: Convert IDs to float to save memory. Will raise error
                          if we encounter a non-numeric ID.
    :type ids_to_floats: bool
    :param quiet: Do not print "Loading..." status message to stderr.
    :type quiet: bool
    :param class_map: Mapping from original class labels to new ones. This is
                      mainly used for collapsing multiple classes into a single
                      class. Anything not in the mapping will be kept the same.
    :type class_map: dict from str to str
    :param unlabelled: Is this test data we're loading? If so, don't raise an
                       error if there are no labels.
    :type unlabelled: bool

    :returns: The merged IDs, classes, features, and feature vectorizer
              representing the given featureset.
    :rtype: ExamplesTuple
    :raises ValueError: if IDs are duplicated within a file, if the files do
                        not share the same IDs in the same order, if two
                        files give different labels for one ID, if two files
                        share a feature name, or if no file has labels and
                        ``unlabelled`` is false.
    '''

    # Load one ExamplesTuple per feature file, in sorted file-name order.
    file_names = sorted(os.path.join(dirpath, featfile + suffix) for featfile
                        in featureset)
    example_tuples = [load_examples(file_name, label_col=label_col,
                                    ids_to_floats=ids_to_floats, quiet=quiet,
                                    class_map=class_map)
                      for file_name in file_names]

    # Check that the IDs are unique within each file.
    for file_name, examples in zip(file_names, example_tuples):
        ex_ids = examples.ids
        if len(ex_ids) != len(set(ex_ids)):
            raise ValueError(('The example IDs are not unique in ' +
                              '{}.').format(file_name))

    # Check that the different feature files have the same IDs.
    # To do this, make a sorted tuple of unique IDs for each feature file,
    # and then make sure they are all the same by making sure the set has one
    # item in it.
    # NOTE: mismatch_num is the number of distinct ID sets found, which is
    # what gets interpolated into the message below.
    mismatch_num = len({tuple(sorted(examples.ids)) for examples in
                        example_tuples})
    if mismatch_num != 1:
        raise ValueError(('The sets of example IDs in {} feature files do ' +
                          'not match').format(mismatch_num))

    # Make sure there is a unique label for every example (or no label, for
    # "unseen" examples).
    # To do this, find the unique (id, y) tuples across every file that has
    # at least one label, and then make sure that all those ids are unique.
    unique_tuples = {(curr_id, curr_label)
                     for examples in example_tuples
                     if any(x is not None for x in examples.classes)
                     for curr_id, curr_label
                     in zip(examples.ids, examples.classes)}
    if len({tup[0] for tup in unique_tuples}) != len(unique_tuples):
        raise ValueError('At least two feature files have different labels ' +
                         '(i.e., y values) for the same ID.')

    # Now, create the final ExamplesTuple of examples with merged features
    merged_vectorizer = None
    merged_features = None
    merged_ids = None
    merged_classes = None
    for ids, classes, features, feat_vectorizer in example_tuples:
        # Combine feature matrices and vectorizers
        if merged_features is not None:
            # Check for duplicate feature names
            if (set(merged_vectorizer.get_feature_names()) &
                    set(feat_vectorizer.get_feature_names())):
                raise ValueError('Two feature files have the same feature!')

            # Columns of this file are appended to the right of the merged
            # matrix, so its vocabulary indices shift by the current width.
            num_merged = merged_features.shape[1]
            merged_features = sp.hstack([merged_features, features], 'csr')

            # dictvectorizer sorts the vocabularies within each file
            for feat_name, index in sorted(feat_vectorizer.vocabulary_.items(),
                                           key=lambda x: x[1]):
                merged_vectorizer.vocabulary_[feat_name] = index + num_merged
                merged_vectorizer.feature_names_.append(feat_name)
        else:
            # The first file's vectorizer is reused (and extended in place)
            # as the merged vectorizer.
            merged_features = features
            merged_vectorizer = feat_vectorizer

        # IDs should be the same for each ExamplesTuple, so only store once
        if merged_ids is None:
            merged_ids = ids
        # Check that IDs are in the same order
        elif not np.all(merged_ids == ids):
            raise ValueError('IDs are not in the same order in each feature ' +
                             'file!')

        # If current ExamplesTuple has labels, check that they don't conflict
        if any(x is not None for x in classes):
            # Classes should be the same for each ExamplesTuple, so store once
            if merged_classes is None:
                merged_classes = classes
            # Check that classes don't conflict, when specified
            elif not np.all(merged_classes == classes):
                raise ValueError('Feature files have conflicting labels for ' +
                                 'examples with the same ID!')

    # Ensure that at least one file had classes if we're expecting them
    if merged_classes is None and not unlabelled:
        raise ValueError('No feature files in feature set contain class ' +
                         'labels!')

    return ExamplesTuple(merged_ids, merged_classes, merged_features,
                         merged_vectorizer)