# Example #1
def iter_configs(input_file_path, dbname):
    """Yield experiment configs derived from round-4 champion configs.

    Each champion from ``loop_champions_round_4`` is combined with every
    candidate classifier, then expanded over prediction-method, seed and
    question-sorting options. Every expanded config is yielded twice:
    once as-is, and once with ``feature_selection_dimension`` forced to 10.

    :param input_file_path: path to the corpus file; its MD5 is recorded
        in every config so results can be traced back to the exact data.
    :param dbname: database name stored verbatim in each config.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Use a context manager so the file handle is closed deterministically
    # (the original open(...).read() leaked the descriptor).
    with open(input_file_path, 'rb') as input_file:
        hasher = hashlib.md5(input_file.read())
    base = {
        # Experiment configuration
        u'config_version': u'6',
        u'input_file_path': input_file_path,
        u'input_file_md5': hasher.hexdigest(),
        u'database_name': dbname
    }

    prediction_patch = {
        u'method': [u'predict', u'predict_proba', 'decision_function'],
    }
    seed_patch = {
        u'shuffle': [u"%i lemon and half lemon" % i for i in range(3)]
    }

    for champ, classifier in product(loop_champions_round_4,
                                     candidate_classifiers):
        # Deep-copy so in-place updates never leak into the shared
        # champion / classifier dicts across iterations.
        champ = deepcopy(champ)
        champ.update(base)
        champ['classifier_config'] = deepcopy(classifier)

        prediction_options_range = list(
            apply_dict_combinations(champ['prediction_config'],
                                    prediction_patch))
        seed_options_range = list(
            apply_dict_combinations(champ[u'seed_facts'], seed_patch))
        patch = {
            u'prediction_config': prediction_options_range,
            u'questions_sorting': [u'score', u'certainty'],
            u'seed_facts': seed_options_range
        }
        for config in apply_dict_combinations(champ, patch):
            config = deepcopy(config)
            # predict_proba requires probability estimates to be enabled
            # on the underlying classifier.
            if config['prediction_config']['method'] == u'predict_proba':
                config['classifier_config']['classifier_args'][
                    u'probability'] = True
            if config['prediction_config']['method'] == u'decision_function':
                config['classifier_config'][
                    'sparse'] = False  # in some cases is failing

            yield config
            # Second variant of the same config with feature selection
            # capped at 10 dimensions.
            config = deepcopy(config)
            config['classifier_config']['feature_selection_dimension'] = 10
            yield config
# Example #2
def iter_configs(input_file_path, dbname):
    """Yield classifier-comparison configs over several classifier types.

    Builds one base config with a fixed feature set, then for each
    classifier in a hand-picked list expands it over train percentages
    and shuffle seeds via ``apply_dict_combinations``.

    :param input_file_path: path to the corpus file; its MD5 is recorded
        in every config for traceability.
    :param dbname: database name stored verbatim in each config.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Context manager guarantees the file handle is released
    # (the original open(...).read() leaked the descriptor).
    with open(input_file_path, "rb") as input_file:
        hasher = hashlib.md5(input_file.read())
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": hasher.hexdigest(),
        u"database_name": dbname,

        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": None,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
                bag_of_words
                bag_of_pos
                bag_of_word_bigrams
                bag_of_wordpos
                bag_of_wordpos_bigrams
                bag_of_words_in_between
                bag_of_pos_in_between
                bag_of_word_bigrams_in_between
                bag_of_wordpos_in_between
                bag_of_wordpos_bigrams_in_between
                entity_order
                entity_distance
                other_entities_in_between
                in_same_sentence
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                BagOfVerbStems True
                BagOfVerbStems False
                BagOfVerbLemmas True
                BagOfVerbLemmas False
        """)
    }
    patch = {u"train_percentage": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
             u"data_shuffle_seed": [u"domino" + str(i) for i in range(10)]}

    xs = [(u"sgd", {}),
          (u"naivebayes", {}),
          (u"naivebayes_m", {}),
          (u"dtree", {u"max_depth": 4, u"min_samples_leaf": 5}),
          (u"logit", {}),
          (u"svm", {}),
          (u"adaboost", {})]
    for classifier, args in xs:
        base[u"classifier"] = classifier
        base[u"classifier_args"] = args
        # Multinomial naive Bayes cannot take scaled (negative) features,
        # so the scaler is disabled only for that classifier.
        base[u"scaler"] = True
        if classifier == "naivebayes_m":
            base[u"scaler"] = False
        for config in apply_dict_combinations(base, patch):
            yield config
# Example #3
def iter_configs(input_file_path, dbname):
    """Yield bootstrap-experiment configs for each candidate classifier.

    Expands a base bootstrap config over prediction methods, seed facts,
    thresholds and sorting options. For SVM + ``predict_proba`` configs,
    two variants are yielded: the true ``predict_proba`` one and a
    ``decision_function`` fallback.

    :param input_file_path: path to the corpus file; its MD5 is recorded
        in every config for traceability.
    :param dbname: database name stored verbatim in each config.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Context manager guarantees the file handle is released
    # (the original open(...).read() leaked the descriptor).
    with open(input_file_path, 'rb') as input_file:
        hasher = hashlib.md5(input_file.read())
    base = {
        # Experiment configuration
        u'experiment': u'bootstrap',
        u'config_version': u'1',
        u'data_shuffle_seed': "a-ha",
        u'input_file_path': input_file_path,
        u'input_file_md5': hasher.hexdigest(),
        u'database_name': dbname,

        # Human In The Middle configuration
        u'answers_per_round': 5,
        u'max_number_of_rounds': 15,

        # Bootstrap configuration
        u'prediction_config': {
            # NOTE(review): u'predic' looks like a typo for u'predict',
            # but prediction_patch below always overrides 'method', so
            # this placeholder value never reaches a yielded config —
            # confirm before changing it.
            u'method': u'predic',
            u'scale_to_range': [0.1, 0.9]
        },
        # threshold are expressed as delta to max, so it's uniformly expressed
        # having scaling enabled or not
        u'fact_threshold_distance': 0.01,
        u'evidence_threshold_distance': 0.01,
        u'questions_sorting': 'score',
        u'seed_facts': {
            u'number_to_use': 5,
            u'shuffle': u"it's a trap"
        },

        # Classifier configuration
        u'classifier_config': {}  # to be filled with each candidate-classifier
    }

    prediction_patch = {
        u'method': [u'predict', u'predict_proba'],
        u'scale_to_range': [None, [0.1, 0.9]]
    }
    prediction_options_range = list(
        apply_dict_combinations(base['prediction_config'], prediction_patch)
    )
    seed_patch = {
        u'number_to_use': [5, 10],
        u'shuffle': [u"%i lemon and half lemon" % i for i in range(2)]
    }
    seed_options_range = list(
        apply_dict_combinations(base[u'seed_facts'], seed_patch)
    )

    patch = {
        u'answers_per_round': [5, 15, 25],
        u'prediction_config': prediction_options_range,
        u'fact_threshold_distance': [0.01, 0.05],
        u'evidence_threshold_distance': [0.01, 0.05],
        u'questions_sorting': [u'score', u'certainty'],
        u'seed_facts': seed_options_range
    }

    for classifier_config in candidate_classifiers:
        base[u'classifier_config'] = classifier_config
        for config in apply_dict_combinations(base, patch):
            # Threshold adjustments: distances are relative to the
            # maximum reachable score, which depends on scaling.
            max_score = 1.0
            if config[u'prediction_config']['scale_to_range']:
                max_score = max(config[u'prediction_config']['scale_to_range'])
            config[u'fact_threshold'] = max_score - config.pop(u'fact_threshold_distance')
            config[u'evidence_threshold'] = max_score - config.pop(u'evidence_threshold_distance')
            if (config[u'classifier_config'][u'classifier'] == u'svm' and
                config[u'prediction_config'][u'method'] == u'predict_proba'):
                # we'll split this config in 2 options: actual predict_proba,
                # and decision_function
                config_copied = deepcopy(config)
                config_copied[u'classifier_config'][u'classifier_args'][u'probability'] = True
                yield config_copied
                # http://scikit-learn.org/stable/modules/svm.html#scores-and-probabilities
                config[u'classifier_config'][u'classifier_args'][u'probability'] = False
                config[u'prediction_config'][u'method'] = u'decision_function'
            yield config
# Example #4
def iter_configs(input_file_path, dbname):
    """Yield grid-search configs for SVM kernels, then for AdaBoost.

    First sweeps SVM hyper-parameters (rbf/poly/linear kernels) over
    train percentages, shuffle seeds and feature selection; then mutates
    the shared base config and sweeps AdaBoost parameters.

    :param input_file_path: path to the corpus file; its MD5 is recorded
        in every config for traceability.
    :param dbname: database name stored verbatim in each config.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Context manager guarantees the file handle is released
    # (the original open(...).read() leaked the descriptor).
    with open(input_file_path, "rb") as input_file:
        hasher = hashlib.md5(input_file.read())
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": hasher.hexdigest(),
        u"database_name": dbname,

        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": 1000,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
                bag_of_words
                bag_of_pos
                bag_of_word_bigrams
                bag_of_wordpos
                bag_of_wordpos_bigrams
                bag_of_words_in_between
                bag_of_pos_in_between
                bag_of_word_bigrams_in_between
                bag_of_wordpos_in_between
                bag_of_wordpos_bigrams_in_between
                entity_order
                entity_distance
                other_entities_in_between
                in_same_sentence
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                BagOfVerbStems True
                BagOfVerbStems False
                BagOfVerbLemmas True
                BagOfVerbLemmas False
        """)
    }

    # SVM
    ######
    patch = {
        u"train_percentage": [0.05 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"daddycool" + str(i) for i in range(20)],
        u"feature_selection": [None, "kbest"]
    }
    svm_args_patches = [
        {
            u"kernel": [u"rbf"],
            u"C": [1, 10, 100],
            u"gamma": [0.0, 0.001, 0.0001]
        },
        {
            u"kernel": [u"poly"],
            u"C": [1, 10, 100],
            u"degree": [2, 3, 4],
            u"gamma": [0.0, 0.001, 0.0001]
        },
        {
            u"kernel": [u"linear"],
            u"C": [1, 10, 100]
        },
    ]

    for argpatch in svm_args_patches:
        for argconfig in apply_dict_combinations({}, argpatch):
            base[u"classifier_args"] = argconfig
            for config in apply_dict_combinations(base, patch):
                yield config

    # Adaboost
    ###########

    # Reuse the same base, switching classifier and disabling the scaler
    # and the feature-selection dimension for the AdaBoost sweep.
    base.update({
        u"classifier": u"adaboost",
        u"feature_selection_dimension": None,
        u"scaler": False,
    })

    patch = {
        u"train_percentage": [0.05 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"daddycool" + str(i) for i in range(10)]
    }
    argpatch = {
        u"n_estimators": [5, 10, 20, 50],
        u"learning_rate": [0.9, 1.0, 1.1],
        u"max_depth": [1, 2, 3]
    }
    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config
# Example #5
def iter_configs(input_file_path, dbname):
    """Yield bootstrap configs (v4) for each candidate classifier.

    Like the v1 bootstrap generator, but skips configs whose fact
    threshold distance is smaller than the evidence one, and replaces
    SVM ``predict_proba`` with ``decision_function`` instead of
    yielding both variants.

    :param input_file_path: path to the corpus file; its MD5 is recorded
        in every config for traceability.
    :param dbname: database name stored verbatim in each config.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Context manager guarantees the file handle is released
    # (the original open(...).read() leaked the descriptor).
    with open(input_file_path, 'rb') as input_file:
        hasher = hashlib.md5(input_file.read())
    base = {
        # Experiment configuration
        u'experiment': u'bootstrap',
        u'config_version': u'4',
        u'data_shuffle_seed': "a-ha",
        u'input_file_path': input_file_path,
        u'input_file_md5': hasher.hexdigest(),
        u'database_name': dbname,

        # Human In The Middle configuration
        u'answers_per_round': 5,
        u'max_number_of_rounds': 15,

        # Bootstrap configuration
        u'prediction_config': {
            u'method': u'decision_function',
            u'scale_to_range': [0.1, 0.9]
        },
        # threshold are expressed as delta to max, so it's uniformly expressed
        # having scaling enabled or not
        u'fact_threshold_distance': 0.01,
        u'evidence_threshold_distance': 0.01,
        u'questions_sorting': u'score',
        u'drop_guesses_each_round': True,
        u'seed_facts': {
            u'number_to_use': 5,
            u'shuffle': u"it's a trap"
        },

        # Classifier configuration
        u'classifier_config': {}  # to be filled with each candidate-classifier
    }

    prediction_patch = {
        u'method': [u'predict', u'predict_proba'],
        u'scale_to_range': [None, [0.1, 0.9]]
    }
    prediction_options_range = list(
        apply_dict_combinations(base['prediction_config'], prediction_patch))
    seed_patch = {
        u'shuffle': [u"%i lemon and half lemon" % i for i in range(2)]
    }
    seed_options_range = list(
        apply_dict_combinations(base[u'seed_facts'], seed_patch))

    patch = {
        u'answers_per_round': [3, 5],
        u'prediction_config': prediction_options_range,
        u'fact_threshold_distance': [0.01, 0.05, 0.1],
        u'evidence_threshold_distance': [0.05, 0.1],
        u'questions_sorting': [u'score', u'certainty'],
        u'seed_facts': seed_options_range
    }

    for classifier_config in candidate_classifiers:
        base[u'classifier_config'] = classifier_config
        for config in apply_dict_combinations(base, patch):
            # Threshold adjustments

            if config[u'fact_threshold_distance'] < config[
                    u'evidence_threshold_distance']:
                # Based on champions of prior rounds, delta for facts shall
                # be greater than delta for evidences.
                # skipping...
                continue

            # Distances are relative to the maximum reachable score,
            # which depends on whether scaling is enabled.
            max_score = 1.0
            if config[u'prediction_config']['scale_to_range']:
                max_score = max(config[u'prediction_config']['scale_to_range'])
            config[u'fact_threshold'] = max_score - config.pop(
                u'fact_threshold_distance')
            config[u'evidence_threshold'] = max_score - config.pop(
                u'evidence_threshold_distance')
            if (config[u'classifier_config'][u'classifier'] == u'svm'
                    and config[u'prediction_config'][u'method']
                    == u'predict_proba'):
                # For SVMs, predict_proba will be replaced with decision_function
                # http://scikit-learn.org/stable/modules/svm.html#scores-and-probabilities
                config[u'classifier_config'][u'classifier_args'][
                    u'probability'] = False
                config[u'prediction_config'][u'method'] = u'decision_function'
                # Also, given that decision_function with rbf doesn't run on
                # sparse matrices
                if config[u'classifier_config'][u'classifier_args'].get(
                        'kernel', '') == 'rbf':
                    config[u'classifier_config'][u'sparse'] = False
            yield config
# Example #6
def iter_configs(input_file_path, dbname):
    """Yield SVM grid-search configs: an RBF sweep, then a poly sweep.

    Both sweeps vary train percentage, shuffle seed, gamma and class
    weights; the poly sweep additionally enables k-best feature
    selection over several dimensions.

    :param input_file_path: path to the corpus file; its MD5 is recorded
        in every config for traceability.
    :param dbname: database name stored verbatim in each config.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Context manager guarantees the file handle is released
    # (the original open(...).read() leaked the descriptor).
    with open(input_file_path, "rb") as input_file:
        hasher = hashlib.md5(input_file.read())
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": hasher.hexdigest(),
        u"database_name": dbname,

        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": None,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
                bag_of_words
                bag_of_pos
                bag_of_word_bigrams
                bag_of_wordpos
                bag_of_wordpos_bigrams
                bag_of_words_in_between
                bag_of_pos_in_between
                bag_of_word_bigrams_in_between
                bag_of_wordpos_in_between
                bag_of_wordpos_bigrams_in_between
                entity_order
                entity_distance
                other_entities_in_between
                in_same_sentence
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                BagOfVerbStems True
                BagOfVerbStems False
                BagOfVerbLemmas True
                BagOfVerbLemmas False
        """)
    }

    # RBF
    ######
    patch = {
        u"train_percentage": [0.07 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"sussieq" + str(i) for i in range(20)]
    }
    argpatch = {
        u"kernel": [u"rbf"],
        u"gamma": [0.0, 1e-4, 1e-5, 1e-6],
        # Class weights sweep: favour positives, negatives, or neither.
        u"class_weight": [{
            True: 10,
            False: 1
        }, {
            True: 1,
            False: 10
        }, {
            True: 1,
            False: 1
        }, {
            True: 1,
            False: 0.1
        }, {
            True: 0.1,
            False: 1
        }]
    }

    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config

    # POLY
    #######

    # Reuse the same base, enabling k-best feature selection for the
    # polynomial-kernel sweep.
    base[u"feature_selection"] = "kbest"
    patch = {
        u"train_percentage": [0.07 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"sussieq" + str(i) for i in range(20)],
        u"feature_selection_dimension": [500, 1000, 2000, 4000]
    }

    argpatch = {
        u"kernel": [u"poly"],
        u"degree": [4],
        u"gamma": [0.0, 1e-4, 1e-5, 1e-6],
        u"class_weight": [{
            True: 10,
            False: 1
        }, {
            True: 1,
            False: 10
        }, {
            True: 1,
            False: 1
        }, {
            True: 1,
            False: 0.1
        }, {
            True: 0.1,
            False: 1
        }]
    }

    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config