Python token_contexts_to_features_categorical示例

编程语言: Python

命名空间/包名称: marmot.experiment.experiment_utils

方法/功能: token_contexts_to_features_categorical

hotexamples.com的示例: 3

Python token_contexts_to_features_categorical - 已找到3个示例。这些是从开源项目中提取的最受好评的marmot.experiment.experiment_utils.token_contexts_to_features_categorical现实Python示例。您可以评价示例，以帮助我们提高示例质量。

示例#1

显示文件

文件： test_experiment.py 项目： Sandy4321/marmot

    def test_token_contexts_to_features_categorical(self):

        token_contexts = {}
        token_contexts['little'] = [{'index':1, 'token':u'little', 'target':[u'the', u'little', u'boy'], 'source':[u'le', u'petit', u'garcon'], 'alignments':[[0],[1],[2]], 'source_pos':[u'Art', u'Adj', u'Noun'], 'target_pos':[u'DT', u'JJ', u'NN']}, {'index':1, 'token':u'little', 'target':[u'a', u'little', u'dog'], 'source':[u'un', u'petit', u'chien'], 'alignments':[[0],[1],[2]], 'source_pos':[u'Art', u'Adj', u'Noun'], 'target_pos':[u'DT', u'JJ', u'NN']}, {'index':1, 'token':u'little', 'target':[u'a', u'little', u'cat'], 'source':[u'un', u'petit', u'chat'], 'alignments':[[0],[1],[2]], 'source_pos':[u'Art', u'Adj', u'Noun'], 'target_pos':[u'DT', u'JJN', u'NN']}]

        feature_extractor_list = self.config['feature_extractors']
        feature_extractors = import_utils.build_objects(feature_extractor_list)

        workers = 8
        mapped_contexts = experiment_utils.token_contexts_to_features_categorical(token_contexts, feature_extractors, workers=8)

        self.assertEqual(set(mapped_contexts.keys()), set(token_contexts.keys()))
        for tok, feature_vecs in mapped_contexts.items():
            self.assertTrue(len(feature_vecs) == len(token_contexts[tok]))
        context = mapped_contexts['little'][0]
        self.assertEqual(context[0], 3)
        self.assertEqual(context[1], 3)
        self.assertAlmostEqual( context[2], 1.0)
        self.assertEqual(context[3], u'petit')
        self.assertEqual(context[4], [u'le'])
        self.assertEqual(context[5], [u'garcon'])
        self.assertEqual([context[6], context[7], context[8], context[9]], [0,0,0,0])
        self.assertEqual(context[12], u'JJ')
        self.assertEqual(context[13], [u'Adj'])

示例#2

显示文件

def main(config):
    # load ContextCreators from config file, run their input functions, and pass the result into the initialization function
    # init() all context creators specified by the user with their arguments
    # import them according to their fully-specified class names in the config file
    # it's up to the user to specify context creators which extract both negative and positive examples (if that's what they want)

    # Chris - working - we want to hit every token
    interesting_tokens = experiment_utils.import_and_call_function(
        config['interesting_tokens'])
    print "INTERESTING TOKENS: ", interesting_tokens
    logger.info('The number of interesting tokens is: ' +
                str(len(interesting_tokens)))
    workers = config['workers']

    # Note: context creators currently create their own interesting tokens internally (interesting tokens controls the index of the context creator)
    logger.info('building the context creators...')
    train_context_creators = experiment_utils.build_objects(
        config['context_creators'])

    # get the contexts for all of our interesting words (may be +,- or, multi-class)
    logger.info(
        'mapping the training contexts over the interesting tokens in train...'
    )
    train_contexts = experiment_utils.map_contexts(interesting_tokens,
                                                   train_context_creators,
                                                   workers=workers)

    # load and parse the test data
    logger.info(
        'mapping the training contexts over the interesting tokens in test...')
    test_context_creator = experiment_utils.build_objects(config['testing'])
    test_contexts = experiment_utils.map_contexts(interesting_tokens,
                                                  [test_context_creator])

    min_total = config['filters']['min_total']
    # filter token contexts based on the user-specified filter criteria
    logger.info(
        'filtering the contexts by the total number of available instances...')
    train_contexts = experiment_utils.filter_contexts(train_contexts,
                                                      min_total=min_total)
    test_contexts = experiment_utils.filter_contexts(test_contexts,
                                                     min_total=min_total)

    # make sure the test_context and train_context keys are in sync
    experiment_utils.sync_keys(train_contexts, test_contexts)

    # test_contexts = filter_contexts(test_contexts, min_total=min_total)
    assert set(test_contexts.keys()) == set(train_contexts.keys())

    # extract the 'tag' attribute into the y-value for classification
    # tags may need to be converted to be consistent with the training data
    wmt_binary_classes = {u'BAD': 0, u'OK': 1}
    train_context_tags = experiment_utils.tags_from_contexts(train_contexts)
    train_context_tags = {
        k: np.array([wmt_binary_classes[v] for v in val])
        for k, val in train_context_tags.items()
    }

    test_contexts = experiment_utils.convert_tagset(wmt_binary_classes,
                                                    test_contexts)
    test_tags_actual = experiment_utils.tags_from_contexts(test_contexts)

    # all of the feature extraction should be parallelizable
    # note that a feature extractor MUST be able to parse the context exchange format, or it should throw an error:
    # { 'token': <token>, index: <idx>, 'source': [<source toks>]', 'target': [<target toks>], 'tag': <tag>}
    feature_extractors = experiment_utils.build_feature_extractors(
        config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_context_features = experiment_utils.token_contexts_to_features_categorical(
        test_contexts, feature_extractors, workers=workers)
    logger.info(
        'mapping the feature extractors over the contexts for train...')
    train_context_features = experiment_utils.token_contexts_to_features_categorical(
        train_contexts, feature_extractors, workers=workers)

    # flatten so that we can properly binarize the features
    all_values = experiment_utils.flatten(test_context_features.values())
    all_values.extend(experiment_utils.flatten(
        train_context_features.values()))
    binarizers = experiment_utils.fit_binarizers(all_values)
    test_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in test_context_features.items()
    }
    train_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in train_context_features.items()
    }

    # BEGIN LEARNING
    classifier_type = experiment_utils.import_class(
        config['learning']['classifier']['module'])
    # train the classifier for each token
    classifier_map = learning_utils.token_classifiers(train_context_features,
                                                      train_context_tags,
                                                      classifier_type)

    # classify the test instances
    # TODO: output a file in WMT format
    # WORKING - dump the output in WMT format
    logger.info('classifying the test instances')
    test_predictions = {}
    for key, features in test_context_features.iteritems():
        try:
            classifier = classifier_map[key]
            predictions = classifier.predict(features)
            test_predictions[key] = predictions
        except KeyError as e:
            print(key + " - is NOT in the classifier map")
            raise

    #### put the rest of the code into a separate 'evaluate' function that reads the WMT files

    # create the performance report for each word in the test data that we had a classifier for
    # TODO: Working - evaluate based on the format
    f1_map = {}
    for token, predicted in test_predictions.iteritems():
        logger.info("Evaluating results for token = " + token)
        actual = test_tags_actual[token]
        print 'Actual: ', actual
        print 'Predicted: ', predicted
        logger.info("\ttotal instances: " + str(len(predicted)))
        f1_map[token] = weighted_fmeasure(actual, predicted)

    logger.info('Printing the map of f1 scores by token: ')
    print(f1_map)

示例#3

显示文件

文件： wmt_word_level_experiment.py 项目： Sandy4321/marmot

def main(config):
    # load ContextCreators from config file, run their input functions, and pass the result into the initialization function
    # init() all context creators specified by the user with their arguments
    # import them according to their fully-specified class names in the config file
    # it's up to the user to specify context creators which extract both negative and positive examples (if that's what they want)

    # Chris - working - we want to hit every token
    interesting_tokens = experiment_utils.import_and_call_function(config['interesting_tokens'])
    print "INTERESTING TOKENS: ", interesting_tokens
    logger.info('The number of interesting tokens is: ' + str(len(interesting_tokens)))
    workers = config['workers']

    # Note: context creators currently create their own interesting tokens internally (interesting tokens controls the index of the context creator)
    logger.info('building the context creators...')
    train_context_creators = experiment_utils.build_objects(config['context_creators'])

    # get the contexts for all of our interesting words (may be +,- or, multi-class)
    logger.info('mapping the training contexts over the interesting tokens in train...')
    train_contexts = experiment_utils.map_contexts(interesting_tokens, train_context_creators, workers=workers)

    # load and parse the test data
    logger.info('mapping the training contexts over the interesting tokens in test...')
    test_context_creator = experiment_utils.build_objects(config['testing'])
    test_contexts = experiment_utils.map_contexts(interesting_tokens, [test_context_creator])

    min_total = config['filters']['min_total']
    # filter token contexts based on the user-specified filter criteria
    logger.info('filtering the contexts by the total number of available instances...')
    train_contexts = experiment_utils.filter_contexts(train_contexts, min_total=min_total)
    test_contexts = experiment_utils.filter_contexts(test_contexts, min_total=min_total)

    # make sure the test_context and train_context keys are in sync
    experiment_utils.sync_keys(train_contexts, test_contexts)

    # test_contexts = filter_contexts(test_contexts, min_total=min_total)
    assert set(test_contexts.keys()) == set(train_contexts.keys())

    # extract the 'tag' attribute into the y-value for classification
    # tags may need to be converted to be consistent with the training data
    wmt_binary_classes = {u'BAD': 0, u'OK': 1}
    train_context_tags = experiment_utils.tags_from_contexts(train_contexts)
    train_context_tags = {k: np.array([wmt_binary_classes[v] for v in val]) for k, val in train_context_tags.items()}

    test_contexts = experiment_utils.convert_tagset(wmt_binary_classes, test_contexts)
    test_tags_actual = experiment_utils.tags_from_contexts(test_contexts)

    # all of the feature extraction should be parallelizable
    # note that a feature extractor MUST be able to parse the context exchange format, or it should throw an error:
    # { 'token': <token>, index: <idx>, 'source': [<source toks>]', 'target': [<target toks>], 'tag': <tag>}
    feature_extractors = experiment_utils.build_feature_extractors(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_context_features = experiment_utils.token_contexts_to_features_categorical(test_contexts, feature_extractors, workers=workers)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_context_features = experiment_utils.token_contexts_to_features_categorical(train_contexts, feature_extractors, workers=workers)

    # flatten so that we can properly binarize the features
    all_values = experiment_utils.flatten(test_context_features.values())
    all_values.extend(experiment_utils.flatten(train_context_features.values()))
    binarizers = experiment_utils.fit_binarizers(all_values)
    test_context_features = {k: [experiment_utils.binarize(v, binarizers) for v in val] for k, val in test_context_features.items()}
    train_context_features = {k: [experiment_utils.binarize(v, binarizers) for v in val] for k, val in train_context_features.items()}

    # BEGIN LEARNING
    classifier_type = experiment_utils.import_class(config['learning']['classifier']['module'])
    # train the classifier for each token
    classifier_map = learning_utils.token_classifiers(train_context_features, train_context_tags, classifier_type)

    # classify the test instances
    # TODO: output a file in WMT format
    # WORKING - dump the output in WMT format
    logger.info('classifying the test instances')
    test_predictions = {}
    for key, features in test_context_features.iteritems():
        try:
            classifier = classifier_map[key]
            predictions = classifier.predict(features)
            test_predictions[key] = predictions
        except KeyError as e:
            print(key + " - is NOT in the classifier map")
            raise

    #### put the rest of the code into a separate 'evaluate' function that reads the WMT files

    # create the performance report for each word in the test data that we had a classifier for
    # TODO: Working - evaluate based on the format
    f1_map = {}
    for token, predicted in test_predictions.iteritems():
        logger.info("Evaluating results for token = " + token)
        actual = test_tags_actual[token]
        print 'Actual: ', actual
        print 'Predicted: ', predicted
        logger.info("\ttotal instances: " + str(len(predicted)))
        f1_map[token] = weighted_fmeasure(actual, predicted)

    logger.info('Printing the map of f1 scores by token: ')
    print(f1_map)