예제 #1
0
def _cross_validation_training(feature_maker, feature_names):
    """Finds the best model configuration for each feature by cross-validation.

    For every feature the model search is run once with and once without
    implicational features, and the higher-scoring setup is kept.

    Args:
      feature_maker: (object) Feature builder, forwarded unchanged to
        `classifier_lib.prepare_data`.
      feature_names: (list) Names of the features to process (strings).

    Returns:
      A nested defaultdict mapping each feature name to a dictionary with
      two entries: "use_implicationals" (the winning boolean setting) and
      "model" (the value returned by `classifier_lib.select_best_model` for
      that setting).
    """
    use_implicationals_values = [True, False]
    best_configs = collections.defaultdict(
        lambda: collections.defaultdict(str))
    # Number of (feature, implicationals) combinations. This excludes the
    # models, which are iterated inside `select_best_model`.
    total_num_configs = len(feature_names) * len(use_implicationals_values)
    n = 1
    for feature_name in feature_names:
        best_models = []
        for use_implicationals in use_implicationals_values:
            logging.info("CV training: %d/%d", n, total_num_configs)
            # Only the training split is needed for cross-validation. The
            # remaining return values are discarded with a starred target so
            # that this call site does not depend on how many extra values
            # `prepare_data` returns (other callers unpack six).
            X_train, y_train, *_ = classifier_lib.prepare_data(
                feature_maker,
                feature_name,
                use_implicationals=use_implicationals)

            # Find the best model for the feature.
            best_model_info = classifier_lib.select_best_model(
                classifier_lib.ALL_MODELS, feature_name, X_train, y_train,
                FLAGS.cv_num_folds, FLAGS.cv_num_repeats)
            best_models.append((use_implicationals, best_model_info))
            n += 1

        # Select the best model with or without implicationals. `max` returns
        # the first maximal element, so a tie is resolved in favour of the
        # earlier entry (use_implicationals=True), exactly as a stable
        # descending sort would.
        best_use_implicationals, best_model_info = max(
            best_models,
            key=lambda info: info[1][classifier_lib.MODEL_INFO_SCORE_KEY])
        best_configs[feature_name]["use_implicationals"] = (
            best_use_implicationals)
        best_configs[feature_name]["model"] = best_model_info

    return best_configs
예제 #2
0
    def _prepare_data_worker(self, feature_name):
        """Builds the evaluation inputs for a single feature.

        Whether the implicationals are used is read from the cross-validation
        configuration of the feature; a feature without a configuration entry
        uses them by default.

        Args:
          feature_name: (string) Name of the feature.

        Returns:
          A triple consisting of the feature name (string), the evaluation
          (or test) input features for the classifier (an array whose first
          dimension indexes the examples) and a list of language codes, one
          code per row of the evaluation features.

        Raises:
          ValueError: If the number of evaluation rows differs from the
            number of language codes.
        """
        # Fall back to using implicationals when the feature is unconfigured.
        with_implicationals = (
            self._configs[feature_name]["use_implicationals"]
            if feature_name in self._configs else True)

        prepared = classifier_lib.prepare_data(
            self._feature_maker,
            feature_name,
            use_implicationals=with_implicationals,
            prediction_mode=self._prediction_mode)
        _, _, eval_inputs, _, language_codes, _ = prepared

        # Every evaluation row must be attributable to exactly one language.
        num_examples = eval_inputs.shape[0]
        num_languages = len(language_codes)
        if num_examples == num_languages:
            return feature_name, eval_inputs, language_codes
        raise ValueError(
            "Number of eval examples (%d) mismatches number of "
            "languages (%d)!" % (num_examples, num_languages))
예제 #3
0
def _train_and_evaluate(feature_maker, feature_names):
    """Trains and evaluates the requested classifiers for each feature.

    Please note: This mode is more suitable for proper evaluation rather than
    a lengthy cross-validation-based training.

    For every feature, each classifier named in `FLAGS.classifiers` is trained
    and scored on the dev set, and the best accuracy is printed to stdout.

    Args:
      feature_maker: (object) Feature builder.
      feature_names: (list) List of WALS feature names (strings).

    Raises:
      Exception: Any error raised while processing a feature is propagated
        unless `FLAGS.catch_exceptions` is set, in which case it is logged
        and the remaining features are still processed.
    """
    for feature_name in feature_names:
        try:
            # Process training and dev data for the feature.
            X_train, y_train, X_dev, y_dev, _, _ = classifier_lib.prepare_data(
                feature_maker,
                feature_name,
                use_implicationals=FLAGS.use_implicationals)

            # Train and evaluate models, remembering the most accurate one.
            # A classifier is only recorded if its accuracy is above zero.
            best_acc = 0
            best_classifier = ""
            for classifier_name in FLAGS.classifiers:
                acc = _train_and_evaluate_model(feature_name, classifier_name,
                                                X_train, y_train, X_dev, y_dev)
                if acc > best_acc:
                    best_acc = acc
                    best_classifier = classifier_name
            print("=== [{}] {}: Dev set: Best Accuracy {}".format(
                feature_name, best_classifier, best_acc))

        except Exception:  # pylint: disable=broad-except
            if not FLAGS.catch_exceptions:
                raise
            # Best-effort mode: keep going, but attach the traceback so the
            # swallowed failure can still be diagnosed from the logs.
            logging.warning("Problem with processing feature: %s",
                            feature_name, exc_info=True)
예제 #4
0
    def _train_model_worker(self, feature_name):
        """Trains the classifier for an individual feature.

        The classifier is `FLAGS.force_classifier` when that flag is set.
        Otherwise it is taken from the cross-validation configuration of the
        feature, falling back to `_DEFAULT_CLASSIFIER_NAME` for features
        without a configuration entry. A configured model is treated as
        unreliable when it is flagged as sparse or its score is below
        `_BAD_ACCURACY_THRESHOLD`; no training happens in that case.

        Args:
          feature_name: (string) Name of the feature.

        Returns:
          A triple `(feature_name, model, ymax_freq)` where `model` is the
          trained classifier and `ymax_freq` is the count stored in the first
          entry of the training class counts (the highest frequency). Both
          `model` and `ymax_freq` are None when no reliable model exists for
          the feature.
        """
        model_is_reliable = True
        if FLAGS.force_classifier:
            # Use single classifier for everything.
            model_name = FLAGS.force_classifier
        else:
            # Select classifiers from the best configuration.
            model_name = _DEFAULT_CLASSIFIER_NAME
            if feature_name in self._configs:
                assert "model" in self._configs[feature_name]
                model_config = self._configs[feature_name]["model"]
                model_name = model_config[classifier_lib.MODEL_INFO_NAME_KEY]
                should_ignore = model_config[
                    classifier_lib.MODEL_INFO_SPARSITY_KEY]
                score = model_config[classifier_lib.MODEL_INFO_SCORE_KEY]
                if should_ignore or score < _BAD_ACCURACY_THRESHOLD:
                    # Not enough training data or low CV accuracy score. Fall
                    # back to search-based approaches.
                    logging.warning("[%s] No reliable models found",
                                    feature_name)
                    model_is_reliable = False

        if not model_is_reliable:
            # Nothing is trained for this feature. Return explicit
            # placeholders instead of referencing unassigned locals.
            # NOTE(review): callers are expected to treat a None model as
            # "use the fallback" -- confirm against the call site.
            return feature_name, None, None

        # Train the model. Please note, the training features have already
        # been constructed and cached by the feature maker during the data
        # preparation step preceding the training.
        logging.info("[%s] %s: \"%s\" ...", self._name, feature_name,
                     model_name)
        # Implicationals are used by default; the cross-validation setting is
        # only consulted when no classifier is forced and the feature actually
        # has a configuration entry.
        use_implicationals = True
        if not FLAGS.force_classifier and feature_name in self._configs:
            use_implicationals = self._configs[feature_name][
                "use_implicationals"]
        X_train, y_train, _, _, _, train_class_counts = (
            classifier_lib.prepare_data(
                self._feature_maker,
                feature_name,
                use_implicationals=use_implicationals,
                prediction_mode=self._prediction_mode))
        ymax_freq = train_class_counts[0][1]  # Highest frequency.
        model = classifier_lib.train_classifier(feature_name, model_name,
                                                X_train, y_train)
        return feature_name, model, ymax_freq