Example #1
def main(config):
    # load ContextCreators from config file, run their input functions, and pass the result into the initialization function
    # init() all context creators specified by the user with their arguments
    # import them according to their fully-specified class names in the config file
    # it's up to the user to specify context creators which extract both negative and positive examples (if that's what they want)

    # Chris - working - we want to hit every token
    interesting_tokens = experiment_utils.import_and_call_function(
        config['interesting_tokens'])
    print "INTERESTING TOKENS: ", interesting_tokens
    logger.info('The number of interesting tokens is: ' +
                str(len(interesting_tokens)))
    workers = config['workers']

    # Note: context creators currently create their own interesting tokens internally (interesting tokens controls the index of the context creator)
    logger.info('building the context creators...')
    train_context_creators = experiment_utils.build_objects(
        config['context_creators'])

    # get the contexts for all of our interesting words (may be +/- or multi-class)
    logger.info(
        'mapping the training contexts over the interesting tokens in train...'
    )
    train_contexts = experiment_utils.map_contexts(interesting_tokens,
                                                   train_context_creators,
                                                   workers=workers)

    # load and parse the test data
    logger.info(
        'mapping the test contexts over the interesting tokens in test...')
    test_context_creator = experiment_utils.build_objects(config['testing'])
    test_contexts = experiment_utils.map_contexts(interesting_tokens,
                                                  [test_context_creator])

    min_total = config['filters']['min_total']
    # filter token contexts based on the user-specified filter criteria
    logger.info(
        'filtering the contexts by the total number of available instances...')
    train_contexts = experiment_utils.filter_contexts(train_contexts,
                                                      min_total=min_total)
    test_contexts = experiment_utils.filter_contexts(test_contexts,
                                                     min_total=min_total)

    # make sure the test_context and train_context keys are in sync
    experiment_utils.sync_keys(train_contexts, test_contexts)

    # test_contexts = filter_contexts(test_contexts, min_total=min_total)
    assert set(test_contexts.keys()) == set(train_contexts.keys())

    # extract the 'tag' attribute into the y-value for classification
    # tags may need to be converted to be consistent with the training data
    wmt_binary_classes = {u'BAD': 0, u'OK': 1}
    train_context_tags = experiment_utils.tags_from_contexts(train_contexts)
    train_context_tags = {
        k: np.array([wmt_binary_classes[v] for v in val])
        for k, val in train_context_tags.items()
    }

    test_contexts = experiment_utils.convert_tagset(wmt_binary_classes,
                                                    test_contexts)
    test_tags_actual = experiment_utils.tags_from_contexts(test_contexts)

    # all of the feature extraction should be parallelizable
    # note that a feature extractor MUST be able to parse the context exchange format, or it should throw an error:
    # {'token': <token>, 'index': <idx>, 'source': [<source toks>], 'target': [<target toks>], 'tag': <tag>}
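    # For illustration only (the token and sentences here are hypothetical), a single
    # record in that exchange format might look like:
    # {'token': u'house', 'index': 3, 'source': [u'das', u'ist', u'ein', u'Haus'],
    #  'target': [u'this', u'is', u'a', u'house'], 'tag': u'OK'}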
    feature_extractors = experiment_utils.build_feature_extractors(
        config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_context_features = experiment_utils.token_contexts_to_features_categorical(
        test_contexts, feature_extractors, workers=workers)
    logger.info(
        'mapping the feature extractors over the contexts for train...')
    train_context_features = experiment_utils.token_contexts_to_features_categorical(
        train_contexts, feature_extractors, workers=workers)

    # flatten so that we can properly binarize the features
    all_values = experiment_utils.flatten(test_context_features.values())
    all_values.extend(experiment_utils.flatten(
        train_context_features.values()))
    binarizers = experiment_utils.fit_binarizers(all_values)
    test_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in test_context_features.items()
    }
    train_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in train_context_features.items()
    }

    # BEGIN LEARNING
    classifier_type = experiment_utils.import_class(
        config['learning']['classifier']['module'])
    # train the classifier for each token
    classifier_map = learning_utils.token_classifiers(train_context_features,
                                                      train_context_tags,
                                                      classifier_type)

    # classify the test instances
    # TODO: output a file in WMT format
    # WORKING - dump the output in WMT format
    logger.info('classifying the test instances')
    test_predictions = {}
    for key, features in test_context_features.iteritems():
        try:
            classifier = classifier_map[key]
            predictions = classifier.predict(features)
            test_predictions[key] = predictions
        except KeyError as e:
            print(key + " - is NOT in the classifier map")
            raise

    #### put the rest of the code into a separate 'evaluate' function that reads the WMT files

    # create the performance report for each word in the test data that we had a classifier for
    # TODO: Working - evaluate based on the format
    f1_map = {}
    for token, predicted in test_predictions.iteritems():
        logger.info("Evaluating results for token = " + token)
        actual = test_tags_actual[token]
        print 'Actual: ', actual
        print 'Predicted: ', predicted
        logger.info("\ttotal instances: " + str(len(predicted)))
        f1_map[token] = weighted_fmeasure(actual, predicted)

    logger.info('Printing the map of f1 scores by token: ')
    print(f1_map)
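
The snippet above never shows the config it is driven by, but the keys it reads imply a structure roughly like the one below. This is a hedged sketch only: the top-level key names come straight from the code, while the module paths, file names, and values are hypothetical placeholders, not the project's actual configuration.

config = {
    'workers': 4,
    # spec consumed by experiment_utils.import_and_call_function (shape assumed)
    'interesting_tokens': {'module': 'my_project.tokens.load_interesting_tokens',
                           'args': ['train.target']},
    # specs consumed by experiment_utils.build_objects (shape assumed)
    'context_creators': [{'module': 'my_project.contexts.CorpusContextCreator',
                          'args': ['train.source', 'train.target', 'train.tags']}],
    'testing': [{'module': 'my_project.contexts.CorpusContextCreator',
                 'args': ['test.source', 'test.target', 'test.tags']}],
    'filters': {'min_total': 10},
    'feature_extractors': [{'module': 'my_project.features.TokenCountFeatureExtractor',
                            'args': []}],
    'learning': {'classifier': {'module': 'sklearn.linear_model.LogisticRegression'}},
}

With a dict of this shape (or the equivalent parsed from a YAML/JSON config file), the entry point is simply main(config).
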
Example #2
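This second example reads a different config layout: datasets are declared under 'datasets', extra representation generators are optional, the data layout is selected via 'contexts', and a 'features' section controls binarization and persistence. A hedged sketch of that shape follows; again, only the key names are taken from the code, and every module path, file name, and value is a hypothetical placeholder.

config = {
    'workers': 4,
    'datasets': {
        # generator specs consumed by build_objects / build_object (shape assumed)
        'training': [{'module': 'my_project.parsers.WMTRepresentationGenerator',
                      'args': ['train.source', 'train.target', 'train.tags']}],
        'test': [{'module': 'my_project.parsers.WMTRepresentationGenerator',
                  'args': ['test.source', 'test.target', 'test.tags']}],
    },
    'representations': [],        # optional additional representation generators
    'contexts': 'plain',          # or 'token' / 'sequential'
    'feature_extractors': [{'module': 'my_project.features.TokenCountFeatureExtractor',
                            'args': []}],
    'features': {'binarize': False,
                 'persist': False,
                 'persist_format': 'crf++',
                 'persist_dir': '/tmp/marmot_features'},   # hypothetical path
    'learning': {'classifier': {'module': 'sklearn.linear_model.LogisticRegression'}},
}
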
def main(config):
    workers = config['workers']

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training
    train_data_generators = build_objects(config['datasets']['training'])
    train_data = {}
    for gen in train_data_generators:
        data = gen.generate()
        for key in data:
            if key not in train_data:
                train_data[key] = []
            train_data[key].extend(data[key])
    # test
    test_data_generator = build_object(config['datasets']['test'][0])
    test_data = test_data_generator.generate()

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)

    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # data_type selects the layout of the data the user wants to learn from: 'plain', 'token', or 'sequential'
    data_type = config['contexts'] if 'contexts' in config else 'plain'

    test_contexts = create_contexts(test_data, data_type=data_type)
    train_contexts = create_contexts(train_data, data_type=data_type)

    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
 
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    # make sure the test_context and train_context keys are in sync
    # TODO: this is important when we are learning token-level classifiers
#    experiment_utils.sync_keys(train_contexts, test_contexts)

    # TODO: this is important when we are learning token-level classifiers
    # test_contexts = filter_contexts(test_contexts, min_total=min_total)
#    assert set(test_contexts.keys()) == set(train_contexts.keys())

    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)

    # all of the feature extraction should be parallelizable
    # note that a feature extractor MUST be able to parse the context exchange format, or it should throw an error:
    # {'token': <token>, 'index': <idx>, 'source': [<source toks>], 'target': [<target toks>], 'tag': <tag>}
    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)

    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))

    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # BEGIN CONVERTING FEATURES TO NUMBERS

    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    # flatten so that we can properly binarize the features
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())
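        # (layout implied by the branches above, stated here for clarity: 'plain' is a flat
        #  list of feature vectors, 'sequential' is a list of per-sentence lists of feature
        #  vectors, and 'token' is a dict mapping each token to its list of feature vectors)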

        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)

        logger.info('All of your features are now scalars in numpy arrays')

    logger.info('training and test sets successfully generated')

    # the way that we persist depends upon the structure of the data (plain/sequence/token_dict)
    # TODO: remove this once we have a list containing all datasets
    if config['features']['persist']:
        if 'persist_format' in config['features']:
            persist_format = config['features']['persist_format']
        else:
            persist_format = 'crf++'
        experiment_datasets = [{'name': 'test', 'features': test_features, 'tags': test_tags}, {'name': 'train', 'features': train_features, 'tags': train_tags}]
        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]

        if 'persist_dir' in config['features']:
            persist_dir = config['features']['persist_dir']
        else:
            persist_dir = os.getcwd()
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=persist_format)



    # TODO: we should only learn and evaluate the model if this is what the user wants
    # TODO: we should be able to dump the features for each of the user's datasets to a file specified by the user

    # BEGIN LEARNING

    # TODO: different sequence learning modules need different representation, we should wrap them in a class
    # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different
    from sklearn.metrics import f1_score
    import numpy as np
    if data_type == 'sequential':
        logger.info('training sequential model...')

        # TODO: move the tag and array conversion code to the utils of this module
        # TODO: check if X and y are in the format we expect
        # TODO: don't hardcode the dictionary
        tag_map = {u'OK': 1, u'BAD': 0}
        train_tags = [[tag_map[tag] for tag in seq] for seq in train_tags]
        test_tags = [[tag_map[tag] for tag in seq] for seq in test_tags]

        # make sure that everything is numpy
        # cast the dataset to numpy array (ndarrays)
        # note that these are _NOT_ matrices, because the inner sequences have different lengths
        x_train = np.array([np.array(xi) for xi in train_features])
        y_train = np.array([np.array(xi) for xi in train_tags])
        x_test = np.array([np.array(xi) for xi in test_features])
        y_test = np.array([np.array(xi) for xi in test_tags])

        # SEQLEARN
        # from seqlearn.perceptron import StructuredPerceptron
        #
        # # seqlearn needs a flat list of instances
        # x_train = np.array([i for seq in x_train for i in seq])
        # y_train = np.array([i for seq in y_train for i in seq])
        # x_test = np.array([i for seq in x_test for i in seq])
        # y_test = np.array([i for seq in y_test for i in seq])
        #
        # # seqlearn requires the lengths of each sequence
        # lengths_train = [len(seq) for seq in train_features]
        # lengths_test = [len(seq) for seq in test_features]
        #
        # clf = StructuredPerceptron(verbose=True, max_iter=400)
        # clf.fit(x_train, y_train, lengths_train)
        #
        # structured_predictions = clf.predict(x_test, lengths_test)
        # logger.info('f1 from seqlearn: {}'.format(f1_score(y_test, structured_predictions, average=None)))

        # END SEQLEARN

        # pystruct
        from marmot.learning.pystruct_sequence_learner import PystructSequenceLearner
        sequence_learner = PystructSequenceLearner()
        sequence_learner.fit(x_train, y_train)
        structured_hyp = sequence_learner.predict(x_test)

        logger.info('scoring sequential model...')
        # print('score: ' + str(structured_predictor.score(x_test, y_test)))

        # TODO: implement this in the config
        # classifier_type = import_class(config['learning']['classifier']['module'])

        # TODO: the flattening is currently a hack to let us use the same evaluation code for structured and plain tasks
        flattened_hyp = flatten(structured_hyp)

        # end pystruct

        test_predictions = flattened_hyp
        flattened_ref = flatten(y_test)
        test_tags = flattened_ref

        logger.info('Structured prediction f1: ')
        print(f1_score(flattened_ref, flattened_hyp, average=None))

    else:
        # data_type is 'token' or 'plain'
        logger.info('start training...')
        classifier_type = import_class(config['learning']['classifier']['module'])
        # train the classifier(s)
        classifier_map = map_classifiers(train_features, train_tags, classifier_type, data_type=data_type)
        logger.info('classifying the test instances')
        test_predictions = predict_all(test_features, classifier_map, data_type=data_type)

    # TODO: this section only works for 'plain'

    print(f1_score(test_tags, test_predictions, average=None))

    # EVALUATION
    logger.info('evaluating your results')

    # TODO: remove the hard coding of the tags here
    bad_count = sum(1 for t in test_tags if t == u'BAD' or t == 0)
    good_count = sum(1 for t in test_tags if t == u'OK' or t == 1)

    total = len(test_tags)
    assert (total == bad_count+good_count), 'tag counts should be correct'
    percent_good = good_count / float(total)
    logger.info('percent good in test set: {}'.format(percent_good))
    logger.info('percent bad in test set: {}'.format(1 - percent_good))

    random_class_results = []
    random_weighted_results = []
    for i in range(20):
        random_tags = list(np.random.choice([1, 0], total, p=[percent_good, 1 - percent_good]))
        # random_tags = [u'GOOD' for i in range(total)]
        random_class_f1 = f1_score(test_tags, random_tags, average=None)
        random_class_results.append(random_class_f1)
        logger.info('two class f1 random score ({}): {}'.format(i, random_class_f1))
        # random_average_f1 = f1_score(random_tags, test_tags, average='weighted')
        random_average_f1 = weighted_fmeasure(test_tags, random_tags)
        random_weighted_results.append(random_average_f1)
        # logger.info('average f1 random score ({}): {}'.format(i, random_average_f1))

    avg_random_class = np.average(random_class_results, axis=0)
    avg_weighted = np.average(random_weighted_results)
    logger.info('two class f1 random average score: {}'.format(avg_random_class))
    logger.info('weighted f1 random average score: {}'.format(avg_weighted))

    actual_class_f1 = f1_score(test_tags, test_predictions, average=None)
    actual_average_f1 = weighted_fmeasure(test_tags, test_predictions)
    logger.info('two class f1 ACTUAL SCORE: {}'.format(actual_class_f1))
    logger.info('weighted f1 ACTUAL SCORE: {}'.format(actual_average_f1))

    if data_type == 'token':
        f1_map = {}
        for token, predicted in test_predictions.iteritems():
            logger.info("Evaluating results for token = " + token)
            actual = test_tags[token]
#            print('Actual: ', actual)
#            print('Predicted: ', predicted)
#            logger.info("\ttotal instances: " + str(len(predicted)))
            f1_map[token] = weighted_fmeasure(actual, predicted)
        logger.info('Printing the map of f1 scores by token: ')
        print(f1_map)
    elif data_type == 'plain':
        f1 = weighted_fmeasure(test_tags, test_predictions)
        logger.info('weighted f1 for plain data: {}'.format(f1))
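
weighted_fmeasure is called throughout these examples but is never defined or imported in the snippets shown (it presumably lives in the project's utility modules). A minimal stand-in, assuming it is simply scikit-learn's label-frequency-weighted F1:

from sklearn.metrics import f1_score


def weighted_fmeasure(y_true, y_pred):
    # Assumption only: average the per-class F1 scores weighted by class support.
    # The project's own implementation may differ.
    return f1_score(y_true, y_pred, average='weighted')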