Exemplo n.º 1
0
def main(
    training_set,
    language,
    outfile,
    model_class,
    model_param,
    extractor_class,
    extractor_param,
    gazetteer,
    folds,
    scoring,
    skip_majority,
    evaluate_gold,
):
    """Train the classifier and dump the fitted model to ``outfile``.

    :param training_set: Open, seekable file-like object with a ``name``
     attribute. Each line is a JSON document providing the ``sentence``,
     ``lu`` and ``fes`` keys.
    :param language: Value passed to the extractor as its ``language`` parameter.
    :param outfile: Destination handed to ``joblib.dump``.
    :param model_class: Model specification, resolved through ``initialize``.
    :param model_param: Model parameters, resolved through ``initialize``.
    :param extractor_class: Extractor specification, resolved through ``initialize``.
    :param extractor_param: Iterable of extra extractor parameters, appended
     after the ``("language", language)`` entry (presumably key/value pairs
     like that entry -- confirm against ``initialize``).
    :param gazetteer: Open file-like object holding the gazetteer as JSON, or
     a falsy value to use an empty gazetteer.
    :param folds: When greater than 1, a k-folds evaluation is run before the
     final fit.
    :param scoring: Forwarded to ``kfolds_evaluation``.
    :param skip_majority: Forwarded to ``kfolds_evaluation``.
    :param evaluate_gold: When truthy, ``gold_evaluation`` is run on the
     training set before the model is trained.
    """

    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    model_cls, model_args = initialize(model_class, model_param, False)

    if evaluate_gold:
        gold_extractor = initialize(extractor_class, [("language", language)] + list(extractor_param), True)

        # gold_evaluation walks the sentences twice; a lazy ``map`` object
        # (Python 3) would already be exhausted on the second pass, so hand
        # over a real list.
        gold_sentences = [json.loads(row) for row in training_set]
        gold_evaluation(gold_sentences, gold_extractor, gazetteer, model_cls, model_args)

        # Rewind so the training pass below reads the file from the start.
        training_set.seek(0)

    extractor = initialize(extractor_class, [("language", language)] + list(extractor_param), True)

    logger.info("Building training set from '%s' ...", training_set.name)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data["sentence"], data["lu"], data["fes"], add_unknown=True, gazetteer=gazetteer)
    x, y = extractor.get_features(refit=True)
    logger.info("Got %d samples with %d features each", *x.shape)

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)

    if folds > 1:
        kfolds_evaluation(folds, model, scoring, skip_majority, x, y)

    logger.info("Fitting model ...")
    model.fit(x, y)

    # The extractor is stored next to the model so both can be loaded together.
    joblib.dump((model, {"extractor": extractor}), outfile)

    logger.info("Done, dumped model to '%s'", outfile)
Exemplo n.º 2
0
def main(training_set, language, outfile, model_class, model_param, extractor_class,
         extractor_param, gazetteer, folds, scoring, skip_majority, evaluate_gold):
    """ Trains the classifier and dumps the fitted model to `outfile`

        :param training_set: Open, seekable file-like object with a `name`
         attribute; each line is a JSON document with the `sentence`, `lu`
         and `fes` keys
        :param language: Value passed to the extractor as its `language` parameter
        :param outfile: Destination handed to `joblib.dump`
        :param model_class: Model specification, resolved through `initialize`
        :param model_param: Model parameters, resolved through `initialize`
        :param extractor_class: Extractor specification, resolved through `initialize`
        :param extractor_param: Iterable of extra extractor parameters, appended
         after the `('language', language)` entry (presumably key/value pairs
         like that entry -- confirm against `initialize`)
        :param gazetteer: Open file-like object holding the gazetteer as JSON,
         or a falsy value to use an empty gazetteer
        :param folds: When greater than 1 a k-folds evaluation is run before
         the final fit
        :param scoring: Forwarded to `kfolds_evaluation`
        :param skip_majority: Forwarded to `kfolds_evaluation`
        :param evaluate_gold: When truthy, `gold_evaluation` is run on the
         training set before the model is trained
    """

    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    model_cls, model_args = initialize(model_class, model_param, False)

    if evaluate_gold:
        gold_extractor = initialize(
            extractor_class, [('language', language)] + list(extractor_param), True
        )

        # gold_evaluation iterates the sentences twice: passing a lazy `map`
        # object (Python 3) would leave nothing for its second pass, so the
        # rows are parsed into a list up front.
        gold_evaluation(
            [json.loads(row) for row in training_set], gold_extractor,
            gazetteer, model_cls, model_args
        )

        # Rewind so the training pass below reads the file from the start.
        training_set.seek(0)

    extractor = initialize(extractor_class, [('language', language)] + list(extractor_param), True)

    logger.info("Building training set from '%s' ...", training_set.name)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['lu'], data['fes'],
                                   add_unknown=True, gazetteer=gazetteer)
    x, y = extractor.get_features(refit=True)
    logger.info('Got %d samples with %d features each', *x.shape)

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)

    if folds > 1:
        kfolds_evaluation(folds, model, scoring, skip_majority, x, y)

    logger.info('Fitting model ...')
    model.fit(x, y)

    # The extractor is stored next to the model so both can be loaded together.
    joblib.dump((model, {
        'extractor': extractor
    }), outfile)

    logger.info("Done, dumped model to '%s'", outfile)
Exemplo n.º 3
0
def gold_evaluation(sentences, extractor, gazetteer, model_cls, model_args):
    """Train on the non-gold sentences and report accuracy on the gold ones.

    Sentences without a truthy ``gold_fes`` entry form the training set; the
    ones with it are tagged by the extractor and scored against ``gold_fes``.
    The accuracy is only logged, nothing is returned.

    :param sentences: Iterable of dicts with the ``sentence``, ``lu`` and
     ``fes`` keys and, for gold sentences, ``gold_fes`` (a mapping from chunk
     to the list of acceptable frame elements).
    :param extractor: Feature extractor used for both training and gold samples.
    :param gazetteer: Gazetteer forwarded to ``extractor.process_sentence``.
    :param model_cls: Forwarded to ``FeatureSelectedClassifier``.
    :param model_args: Forwarded to ``FeatureSelectedClassifier``.
    """
    logger.info("Evaluating on the gold sentences")

    # The sentences are scanned twice (training pass, then gold pass); a
    # generator or ``map`` object would be exhausted after the first scan.
    sentences = list(sentences)

    for each in sentences:
        if not each.get("gold_fes"):
            extractor.process_sentence(each["sentence"], each["lu"], each["fes"], add_unknown=True, gazetteer=gazetteer)
    x_tr, y_tr = extractor.get_features(refit=True)

    # NOTE(review): start() presumably resets the accumulated samples before
    # the gold sentences are processed -- confirm against the extractor.
    extractor.start()
    tagged_gold = []
    for each in sentences:
        if each.get("gold_fes"):
            tagged = extractor.process_sentence(
                each["sentence"], each["lu"], each["fes"], add_unknown=False, gazetteer=gazetteer
            )
            tagged_gold.append((each["gold_fes"], tagged))

    if not tagged_gold:
        logger.warning("asked to evaluate gold, but no gold sentences found")
        return

    x_gold, _ = extractor.get_features(refit=False)
    y_gold = []
    for gold_fes, tagged in tagged_gold:
        for chunk, is_sample in tagged:
            if is_sample:
                # Each sample may have several acceptable labels; a missing
                # frame element is mapped to the "O" label.
                y_gold.append([extractor.label_index[fe or "O"] for fe in gold_fes.get(chunk, [])])

    assert len(y_gold) == x_gold.shape[0]

    if not y_gold:
        # Nothing to score: avoid fitting for no reason and dividing by zero.
        logger.warning("asked to evaluate gold, but the gold sentences produced no samples")
        return

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)
    model.fit(x_tr, y_tr)
    y_pred = model.predict(x_gold)

    # A prediction is correct when it matches any of the acceptable labels.
    correct = sum(1 for actual, predicted in zip(y_gold, y_pred) if predicted in actual)
    logger.info("Gold accuracy: %f (%d / %d roles)", float(correct) / len(y_gold), correct, len(y_gold))
Exemplo n.º 4
0
def gold_evaluation(sentences, extractor, gazetteer, model_cls, model_args):
    """ Trains on the non-gold sentences and logs the accuracy on the gold ones

        Sentences without a truthy `gold_fes` entry form the training set, the
        others are tagged by the extractor and scored against `gold_fes`.
        Nothing is returned, the accuracy is only logged.

        :param sentences: Iterable of dicts with the `sentence`, `lu` and `fes`
         keys and, for gold sentences, `gold_fes` (a mapping from chunk to the
         list of acceptable frame elements)
        :param extractor: Feature extractor used for training and gold samples
        :param gazetteer: Gazetteer forwarded to `extractor.process_sentence`
        :param model_cls: Forwarded to `FeatureSelectedClassifier`
        :param model_args: Forwarded to `FeatureSelectedClassifier`
    """
    logger.info('Evaluating on the gold sentences')

    # Two scans follow (training pass, then gold pass): materialise the input
    # so that a generator or `map` object is not exhausted by the first one.
    sentences = list(sentences)

    for each in sentences:
        if not each.get('gold_fes'):
            extractor.process_sentence(each['sentence'], each['lu'], each['fes'],
                                       add_unknown=True, gazetteer=gazetteer)
    x_tr, y_tr = extractor.get_features(refit=True)

    # NOTE(review): start() presumably resets the accumulated samples before
    # the gold sentences are processed -- confirm against the extractor.
    extractor.start()
    tagged_gold = []
    for each in sentences:
        if each.get('gold_fes'):
            tagged_gold.append((each['gold_fes'], extractor.process_sentence(
                each['sentence'], each['lu'], each['fes'],
                add_unknown=False, gazetteer=gazetteer
            )))

    if not tagged_gold:
        logger.warning('asked to evaluate gold, but no gold sentences found')
        return

    x_gold, _ = extractor.get_features(refit=False)
    y_gold = []
    for gold_fes, tagged in tagged_gold:
        for chunk, is_sample in tagged:
            if is_sample:
                # Several labels may be acceptable for one sample; a missing
                # frame element is mapped to the 'O' label.
                y_gold.append([extractor.label_index[fe or 'O'] for fe in gold_fes.get(chunk, [])])

    assert len(y_gold) == x_gold.shape[0]

    if not y_gold:
        # Nothing to score: avoid fitting for no reason and dividing by zero.
        logger.warning('asked to evaluate gold, but the gold sentences produced no samples')
        return

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)
    model.fit(x_tr, y_tr)
    y_pred = model.predict(x_gold)

    # A prediction is correct when it matches any of the acceptable labels.
    correct = sum(1 for actual, predicted in zip(y_gold, y_pred) if predicted in actual)
    logger.info('Gold accuracy: %f (%d / %d roles)', float(correct) / len(y_gold),
                correct, len(y_gold))
Exemplo n.º 5
0
    def fit(self, training_sets):
        """ Searches for the best estimator and its arguments as well as the best
            training set amongst those specified.

            Every (model class, parameter grid) pair in `self.models` is grid
            searched on every training set; the combination with the highest
            score wins, the first one found being kept on ties.

            :param generator training_sets: Training sets to use. Should be a
             sequence of tuples (metadata, extractor) where extractor provides
             the features through `get_features` and metadata contains
             additional data that will be returned back
            :return: the winning training set as a tuple (x, y, metadata), the
             best score obtained by the model, parameters of the model and
             fitted model itself. All four are None when nothing was evaluated
            :rtype: tuple
        """
        best_training, best_score, best_params, best_model = None, None, None, None
        for metadata, extractor in training_sets:
            for model_cls, grid in self.models:
                assert isclass(model_cls)

                x, y = extractor.get_features(refit=True)

                # Search over a copy: the grids stored in self.models are shared
                # configuration and must not pick up per-run entries.
                search_grid = dict(grid)
                search_grid['model_cls'] = [model_cls]
                search_grid['selector_column'] = [None, extractor.lu_column()]

                search = GridSearchCV(FeatureSelectedClassifier(model_cls),
                                      param_grid=search_grid,
                                      **self.kwargs)
                search.fit(x, y)

                score, params, fitted = (search.best_score_, search.best_params_,
                                         search.best_estimator_)
                logger.debug(
                    '%s with parameters %s and training meta %s has score %s',
                    type(fitted), params, metadata, score)
                if best_score is None or score > best_score:
                    best_training = (x, y, metadata)
                    best_score, best_params, best_model = score, params, fitted

        return best_training, best_score, best_params, best_model