Exemplo n.º 1
0
    def test_kfolds_eval(self, binary_problem=False):
        """Check the shape and dtypes of the K-folds evaluation table.

        Runs ``Classifier.eval`` in ``"K_FOLDS"`` mode with three folds on a
        deep copy of the labeled fixture (the binary fixture when
        ``binary_problem`` is true) and verifies that the result is a
        DataFrame whose first column has dtype ``object`` and whose column
        count is ``max(1 + number_of_labels, 5)``.
        """
        # Pick the fixture/label pair once instead of duplicating the call.
        if binary_problem:
            source_df = self.labeled_binary_inp_df
            expected_labels = self.binary_labels
        else:
            source_df = self.labeled_inp_df
            expected_labels = self.labels

        input_df = source_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        ac = Classifier(model_configuration=testModelConfiguration)
        res_df = ac.eval(input_df=input_df,
                         schema=schema,
                         mode="K_FOLDS",
                         nfolds=3)

        self.assertIsInstance(res_df, pd.DataFrame)
        # ``dtypes`` is indexed by column label; positional access has to go
        # through ``.iloc`` (integer keys on ``[]`` are a deprecated fallback).
        self.assertEqual(res_df.dtypes.iloc[0], "object")
        self.assertEqual(len(res_df.columns),
                         max(1 + len(expected_labels), 5))
Exemplo n.º 2
0
    def test_numclasses(self):
        """Check that a trained classifier reports one class per fixture label.

        Trains on a deep copy of the labeled fixture and compares
        ``Classifier.num_classes()`` with ``len(self.labels)``.
        """
        train_df = self.labeled_inp_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        ac = Classifier(model_configuration=testModelConfiguration)
        ac.train(input_df=train_df, schema=schema)

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(ac.num_classes(), len(self.labels))
Exemplo n.º 3
0
    def test_predict_proba(self):
        """Check the output of ``predict_proba`` in both prediction modes.

        A classifier is trained, its models are exported with ``get_models``
        and loaded into a fresh ``Classifier``.  For ``multilabel_pred`` set
        to ``True`` and then ``False`` the test verifies that:

        * the result is a DataFrame with three columns more than the
          unlabeled input fields and ``self.num_recs`` rows;
        * the last three columns have dtypes ``object``, ``object`` and
          ``float64`` (in that order);
        * every predicted label (comma-separated in multilabel mode) is one
          of ``self.labels``;
        * the per-row values in the ``"Probabilities"`` column sum to 1 within
          a tolerance of ``0.005 * len(self.fields_without_label)``.
        """
        labeled_df = self.labeled_inp_df.copy(deep=True)
        unlabeled_df = self.nonlabeled_inp_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        trainer = Classifier(model_configuration=testModelConfiguration)
        trainer.train(input_df=labeled_df, schema=schema)
        lr, fm, lm = trainer.get_models()

        ac = Classifier(model_configuration=testModelConfiguration)
        ac.load_models(lr, fm, lm)

        tolerance = 0.005 * len(self.fields_without_label)

        for multilabel in (True, False):
            res_df = ac.predict_proba(input_df=unlabeled_df,
                                      multilabel_pred=multilabel)

            self.assertIsInstance(res_df, pd.DataFrame)
            self.assertEqual(len(res_df.columns),
                             len(self.fields_without_label) + 3)
            # Positional dtype lookups must use ``.iloc``; integer keys on a
            # label-indexed Series are a deprecated fallback.
            col_dtypes = res_df.dtypes
            self.assertEqual(col_dtypes.iloc[-1], "float64")
            self.assertEqual(col_dtypes.iloc[-2], "object")
            self.assertEqual(col_dtypes.iloc[-3], "object")
            self.assertEqual(len(res_df), self.num_recs)

            # Third column from the end holds the predicted label(s).
            for cell in res_df.iloc[:, -3]:
                predicted = cell.split(",") if multilabel else [cell]
                for lbl in predicted:
                    self.assertIn(lbl, self.labels)

            # Each "Probabilities" cell is "label:prob,label:prob,...";
            # the probabilities must sum to (approximately) 1.
            for prob_cell in res_df["Probabilities"]:
                total = sum(
                    float(item.split(':')[1])
                    for item in prob_cell.split(','))
                # A chained comparison also rejects NaN sums, which the
                # previous ">= upper or <= lower" check silently accepted.
                self.assertTrue(
                    1.0 - tolerance < total < 1.0 + tolerance,
                    "probabilities sum to %r, expected 1.0 +/- %r" %
                    (total, tolerance))
Exemplo n.º 4
0
    def test_training(self):
        """Check the types of the models produced by ``Classifier.train``.

        After training on a deep copy of the labeled fixture, ``get_models``
        must return a learner that is an ``LR``, ``LSVC`` or ``Ensemble``
        instance, followed by two ``Featurizer`` instances.
        """
        train_df = self.labeled_inp_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        ac = Classifier(model_configuration=testModelConfiguration)
        ac.train(input_df=train_df, schema=schema)
        lr, fm, lm = ac.get_models()

        # assertIsInstance names the offending type when the check fails.
        self.assertIsInstance(lr, (LR, LSVC, Ensemble))
        self.assertIsInstance(fm, Featurizer)
        self.assertIsInstance(lm, Featurizer)
Exemplo n.º 5
0
def top_rfe_features(labeled_dataset, config, topN = None):
    """Return the top-ranked features of a labeled dataset.

    The dataset is converted to a DataFrame, a ``Classifier`` is built from
    ``config`` and ``feature_ranking`` is run in
    ``Classifier.CC_fs_backward`` mode.  Ranking rows are grouped by the part
    of the ``Feature`` value before ``'::'``, their ``Score`` values are
    summed, and the groups are sorted by descending total score.

    Args:
        labeled_dataset: Dataset accepted by ``datasetToDataframe`` and
            ``defaultFeatures``.
        config: Mapping with the keys read below (``"type"``,
            ``"weighting"``, ``"tokenizer"``, ``"ngrams"``, ...).
        topN: Maximum number of feature names to keep; ``None`` keeps all.

    Returns:
        The entries of ``defaultFeatures(...)`` whose ``'name'`` is among the
        selected names (in ranking order), followed by the last entry of that
        list.  NOTE(review): the last entry is presumably the label feature —
        confirm against ``defaultFeatures``.
    """
    labeled_inp_df = datasetToDataframe(labeled_dataset)
    features = defaultFeatures(dataset=labeled_dataset)
    featurizers = defaultFeaturizers(features)

    stop_words = ENGLISH_STOP_WORDS if config["stopwords"] == "ENGLISH" else []

    # Dispatch tables replace the nested conditional expressions; unknown
    # keys still map to None.
    tokenizer_classes = {
        "WORD_TOKENIZER": BaseTokenizer,
        "STEMMER": PorterTokenizer,
        "LEMMATIZER": LemmaTokenizer,
    }
    tokenizer_cls = tokenizer_classes.get(config["tokenizer"])
    tokenizer = tokenizer_cls() if tokenizer_cls is not None else None

    ngram_ranges = {
        "UNIGRAM": (1, 1),
        "BIGRAM": (2, 2),
        "BOTH": (1, 2),
    }
    ngram_range = ngram_ranges.get(config["ngrams"])

    # Same mapping as create_classifier: the string 'none' means "no class
    # weighting" and is passed on as None.
    weighting = config['weighting'].lower()
    class_weight = None if weighting == 'none' else weighting

    ac = Classifier(model_configuration={
        "type": config['type'],
        "class_weight": class_weight,
        "tokenizer": tokenizer,
        "ngram_range": ngram_range,
        "sublinear_tf": config['tf'] == "SUBLINEAR",
        "smooth_idf": config['df'] == "SMOOTH",
        "penalty": config['penalty'].lower(),
        "multi_class": config['multiclass'].lower(),
        "solver": config['solver'].lower(),
        "dual": config['primal_dual'] == "DUAL",
        "fit_intercept": config['fitIntercept'],
        'max_df': config['max_df'],
        'min_df': config['min_df'],
        'stopwords': stop_words,
        'C': config['C'],
        'max_iter': config['max_iter'],
    })

    res_df = ac.feature_ranking(input_df=labeled_inp_df,
                                schema=featurizers,
                                mode=Classifier.CC_fs_backward)

    # Derive the base feature name with Series.map so the result keeps
    # res_df's index.  Building a fresh Series from an iterator would get a
    # new RangeIndex and pair names with the wrong scores whenever res_df is
    # not indexed 0..n-1.
    base_names = res_df['Feature'].map(lambda fname: fname.split('::')[0])
    feature_scores = pd.DataFrame({'Feature': base_names,
                                   'Score': res_df['Score']})
    summed_scores = feature_scores.groupby('Feature').sum()
    ranked = summed_scores.sort_values(by=["Score"], ascending=False)

    selected_names = list(ranked.index)[:topN]
    selected_features = [
        feat
        for fname in selected_names
        for feat in features
        if feat['name'] == fname
    ]

    return selected_features + [features[-1]]
Exemplo n.º 6
0
    def test_LOO_eval_table_format(self):
        """Check the shape and dtypes of the leave-one-out evaluation table.

        Runs ``Classifier.eval`` in ``"LEAVE_ONE_OUT"`` mode and verifies the
        result is a DataFrame with ``max(1 + len(self.labels), 5)`` columns
        whose first column has dtype ``object``.
        """
        input_df = self.labeled_inp_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        ac = Classifier(model_configuration=testModelConfiguration)
        res_df = ac.eval(input_df=input_df,
                         schema=schema,
                         mode="LEAVE_ONE_OUT",
                         nfolds=3)

        self.assertIsInstance(res_df, pd.DataFrame)
        self.assertEqual(len(res_df.columns), max(1 + len(self.labels), 5))
        # Positional dtype lookup must use ``.iloc``; an integer key on the
        # label-indexed ``dtypes`` Series is a deprecated fallback.
        self.assertEqual(res_df.dtypes.iloc[0], "object")
Exemplo n.º 7
0
    def test_correlation_feature_selection(self):
        """Check the format of the correlation-based feature ranking.

        Runs ``feature_ranking`` in ``Classifier.CC_fs_correlation`` mode with
        the non-negative schema fixture and verifies the result is a
        two-column DataFrame: an ``object`` column followed by an ``int64``
        or ``float64`` column.
        """
        input_df = self.labeled_inp_df.copy(deep=True)
        schema = self.schema_with_label_nonnegative.copy()

        ac = Classifier(model_configuration=testModelConfiguration)
        res_df = ac.feature_ranking(input_df=input_df,
                                    schema=schema,
                                    mode=Classifier.CC_fs_correlation)

        self.assertIsInstance(res_df, pd.DataFrame)
        self.assertEqual(len(res_df.columns), 2)
        # Use ``.iloc`` for positional dtype access; integer keys on the
        # label-indexed ``dtypes`` Series are a deprecated fallback.
        col_dtypes = res_df.dtypes
        self.assertEqual(col_dtypes.iloc[0], "object")
        self.assertIn(col_dtypes.iloc[1], ["int64", "float64"])
Exemplo n.º 8
0
    def test_labels(self):
        """Check that the trained classifier exposes the fixture's labels.

        After training, ``Classifier.labels()`` must contain no label missing
        from ``self.labels``, ``self.labels`` must contain no label missing
        from the classifier, and both must have the same length.
        """
        train_df = self.labeled_inp_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        classifier = Classifier(model_configuration=testModelConfiguration)
        classifier.train(input_df=train_df, schema=schema)
        trained_labels = classifier.labels()

        # Labels the classifier reports that the fixture does not know.
        unexpected = []
        for lbl in trained_labels:
            if lbl not in self.labels:
                unexpected.append(lbl)

        # Fixture labels that the classifier does not report.
        missing = []
        for lbl in self.labels:
            if lbl not in trained_labels:
                missing.append(lbl)

        self.assertTrue(len(unexpected) == 0)
        self.assertTrue(len(missing) == 0)
        self.assertTrue(len(trained_labels) == len(self.labels))
Exemplo n.º 9
0
def create_classifier(config):
    """Build a ``Classifier`` from a configuration mapping.

    Translates the upper-case option names found in ``config`` into the
    ``model_configuration`` dictionary passed to ``Classifier``: tokenizer
    and n-gram names become objects/tuples, flag names become booleans, and
    several string options are lower-cased.  A weighting of ``'none'``
    (case-insensitive) becomes ``None``.

    NOTE(review): ``id`` is called with no arguments, so it presumably
    resolves to a project-level helper rather than the builtin — confirm.
    """
    # Values are computed in the same order the original dict literal
    # evaluated them.
    model_id = id()
    model_type = config["type"]

    weighting = config['weighting'].lower()
    class_weight = None if weighting == 'none' else weighting

    tokenizer_name = config["tokenizer"]
    if tokenizer_name == "WORD_TOKENIZER":
        tokenizer = BaseTokenizer()
    elif tokenizer_name == "STEMMER":
        tokenizer = PorterTokenizer()
    elif tokenizer_name == "LEMMATIZER":
        tokenizer = LemmaTokenizer()
    else:
        tokenizer = None

    ngrams_name = config["ngrams"]
    if ngrams_name == "UNIGRAM":
        ngram_range = (1, 1)
    elif ngrams_name == "BIGRAM":
        ngram_range = (2, 2)
    elif ngrams_name == "BOTH":
        ngram_range = (1, 2)
    else:
        ngram_range = None

    model_configuration = {
        'id': model_id,
        "type": model_type,
        "class_weight": class_weight,
        "tokenizer": tokenizer,
        "ngram_range": ngram_range,
        "sublinear_tf": config["tf"] == "SUBLINEAR",
        "smooth_idf": config["df"] == "SMOOTH",
        "penalty": config['penalty'].lower(),
        "multi_class": config['multiclass'].lower(),
        "solver": config['solver'].lower(),
        "dual": config['primal_dual'] == 'DUAL',
        "fit_intercept": config['fitIntercept'],
        'max_df': config['max_df'],
        'min_df': config['min_df'],
    }
    if config["stopwords"] == "ENGLISH":
        model_configuration['stopwords'] = ENGLISH_STOP_WORDS
    else:
        model_configuration['stopwords'] = []
    model_configuration['C'] = config['C']
    model_configuration['max_iter'] = config['max_iter']

    return Classifier(model_configuration=model_configuration)
Exemplo n.º 10
0
    def test_learn(self):
        """Check the output format of ``Classifier.learn``.

        A classifier is trained, its models are exported and loaded into a
        fresh ``Classifier``, and ``learn`` is run on the unlabeled fixture.
        The result must be a DataFrame with three columns more than the
        unlabeled input fields, the last of dtype ``float64``.
        """
        labeled_df = self.labeled_inp_df.copy(deep=True)
        unlabeled_df = self.nonlabeled_inp_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        trainer = Classifier(model_configuration=testModelConfiguration)
        trainer.train(input_df=labeled_df, schema=schema)
        lr, fm, lm = trainer.get_models()

        ac = Classifier(model_configuration=testModelConfiguration)
        ac.load_models(lr, fm, lm)

        res_df = ac.learn(input_df=unlabeled_df)

        self.assertIsInstance(res_df, pd.DataFrame)
        self.assertEqual(len(res_df.columns),
                         len(self.fields_without_label) + 3)
        # Positional dtype lookup must use ``.iloc``; an integer key on the
        # label-indexed ``dtypes`` Series is a deprecated fallback.
        self.assertEqual(res_df.dtypes.iloc[-1], "float64")
Exemplo n.º 11
0
    def test_backward_feature_selection(self):
        """Check the format of the backward-elimination feature ranking.

        Skipped when the configured model type is one of the ensemble types.
        Otherwise runs ``feature_ranking`` in ``Classifier.CC_fs_backward``
        mode and verifies the result is a two-column DataFrame: an ``object``
        column followed by an ``int64`` or ``float64`` column.
        """
        ensemble_types = [
            Classifier.ENSEMBLE_SVC_MODEL_TYPE,
            Classifier.ENSEMBLE_LR_MODEL_TYPE,
        ]
        if testModelConfiguration['type'] in ensemble_types:
            # Report the test as skipped rather than silently passing.
            self.skipTest(
                "backward feature selection is not exercised for ensemble "
                "model types")

        input_df = self.labeled_inp_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        ac = Classifier(model_configuration=testModelConfiguration)
        res_df = ac.feature_ranking(input_df=input_df,
                                    schema=schema,
                                    mode=Classifier.CC_fs_backward)

        self.assertIsInstance(res_df, pd.DataFrame)
        self.assertEqual(len(res_df.columns), 2)
        # Use ``.iloc`` for positional dtype access; integer keys on the
        # label-indexed ``dtypes`` Series are a deprecated fallback.
        col_dtypes = res_df.dtypes
        self.assertEqual(col_dtypes.iloc[0], "object")
        self.assertIn(col_dtypes.iloc[1], ["int64", "float64"])
Exemplo n.º 12
0
    def test_eval_data(self, binary_problem=False):
        """Check the structure of the raw evaluation data.

        Runs ``Classifier.eval_data`` in ``"LEAVE_ONE_OUT"`` mode on a deep
        copy of the labeled fixture (the binary fixture when
        ``binary_problem`` is true) and verifies that:

        * the number of labels is 2 for the binary problem, otherwise
          ``len(self.labels)``;
        * true and predicted label sequences both have ``self.num_recs``
          entries;
        * the confusion matrix has one row per label and its first row has
          one entry per label;
        * the classification report has ``precision``, ``recall``,
          ``f1-score`` and ``support`` for every label plus ``'macro avg'``
          and ``'weighted avg'``, and an ``'accuracy'`` entry.
        """
        if binary_problem:
            source_df = self.labeled_binary_inp_df
            expected_num_labels = 2
        else:
            source_df = self.labeled_inp_df
            expected_num_labels = len(self.labels)

        input_df = source_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        ac = Classifier(model_configuration=testModelConfiguration)
        labels, true_lbls, pred_lbls, conf_mat, cls_report = ac.eval_data(
            input_df=input_df,
            schema=schema,
            mode="LEAVE_ONE_OUT",
            nfolds=3)

        # assertEqual / assertIn report the offending values on failure.
        self.assertEqual(len(labels), expected_num_labels)

        self.assertEqual(len(true_lbls), self.num_recs)
        self.assertEqual(len(true_lbls), len(pred_lbls))

        self.assertEqual(len(conf_mat), len(labels))
        self.assertEqual(len(conf_mat[0]), len(labels))

        report_rows = list(labels) + ['macro avg', 'weighted avg']
        for lbl in report_rows:
            self.assertIn(lbl, cls_report.keys())
            for metric in ('precision', 'recall', 'f1-score', 'support'):
                self.assertIn(metric, cls_report[lbl])

        self.assertIn('accuracy', cls_report.keys())
Exemplo n.º 13
0
    def test_model_visualization(self, binary_problem=False):
        """Check the output format of ``Classifier.model_visualization``.

        A classifier is trained on the labeled fixture (the binary fixture
        when ``binary_problem`` is true), its models are exported and loaded
        into a fresh ``Classifier``.  The visualization must be a
        three-column DataFrame with dtypes ``object``, ``object`` and
        ``float64``.
        """
        if binary_problem:
            source_df = self.labeled_binary_inp_df
        else:
            source_df = self.labeled_inp_df
        train_df = source_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        trainer = Classifier(model_configuration=testModelConfiguration)
        trainer.train(input_df=train_df, schema=schema)
        lr, fm, lm = trainer.get_models()

        ac = Classifier(model_configuration=testModelConfiguration)
        ac.load_models(lr, fm, lm)

        res_df = ac.model_visualization()

        self.assertIsInstance(res_df, pd.DataFrame)
        self.assertEqual(len(res_df.columns), 3)
        # Use ``.iloc`` for positional dtype access; integer keys on the
        # label-indexed ``dtypes`` Series are a deprecated fallback.
        col_dtypes = res_df.dtypes
        self.assertEqual(col_dtypes.iloc[-1], "float64")
        self.assertEqual(col_dtypes.iloc[-2], "object")
        self.assertEqual(col_dtypes.iloc[-3], "object")
Exemplo n.º 14
0
    def test_predict_explain(self, binary_problem=False):
        """Check the output of ``Classifier.predict_explain``.

        A classifier is trained on the labeled fixture (the binary fixture
        when ``binary_problem`` is true), its models are exported and loaded
        into a fresh ``Classifier``, and ``predict_explain`` is run on the
        unlabeled fixture.  The result must be a DataFrame with four columns
        more than the unlabeled input fields, whose last four dtypes are
        ``object``, ``object``, ``float64``, ``object``.

        Every entry of the ``"TopContributors"`` column (``;``-separated,
        each ``feature=...``) must name a known input field; for
        ``field::value`` features of type ``string``/``numeric``/``boolean``
        the value must equal the row's value for that field.
        """
        if binary_problem:
            source_df = self.labeled_binary_inp_df
        else:
            source_df = self.labeled_inp_df
        train_df = source_df.copy(deep=True)
        unlabeled_df = self.nonlabeled_inp_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        trainer = Classifier(model_configuration=testModelConfiguration)
        trainer.train(input_df=train_df, schema=schema)
        lr, fm, lm = trainer.get_models()

        ac = Classifier(model_configuration=testModelConfiguration)
        ac.load_models(lr, fm, lm)

        res_df = ac.predict_explain(input_df=unlabeled_df)

        self.assertIsInstance(res_df, pd.DataFrame)
        self.assertEqual(len(res_df.columns),
                         len(self.fields_without_label) + 4)
        # Use ``.iloc`` for positional dtype access; integer keys on the
        # label-indexed ``dtypes`` Series are a deprecated fallback.
        col_dtypes = res_df.dtypes
        self.assertEqual(col_dtypes.iloc[-1], "object")
        self.assertEqual(col_dtypes.iloc[-2], "float64")
        self.assertEqual(col_dtypes.iloc[-3], "object")
        self.assertEqual(col_dtypes.iloc[-4], "object")

        # Column positions are used to look up each field's schema type;
        # build the list once instead of once per contributing feature.
        result_columns = list(res_df.columns)
        exact_match_types = ("string", "numeric", "boolean")

        # Test if all top-contributed features are present
        def check_contributors_exist(row):
            for contrib in row["TopContributors"].split(';'):
                feat = contrib.split('=')[0]
                if '::' not in feat:
                    # Plain feature name: must be a known field unless empty.
                    if len(feat) > 0:
                        self.assertIn(feat, self.fields_without_label)
                    continue

                field_name, field_value = feat.split('::')
                self.assertIn(field_name, self.fields_without_label)
                fld_no = result_columns.index(field_name)
                # Values of "text"/"set" fields (and any other type) are not
                # checked against the row; only exact-match types are.
                if self.schema_without_label[fld_no] in exact_match_types:
                    self.assertEqual(field_value, row[field_name])

        res_df.apply(check_contributors_exist, axis=1)
Exemplo n.º 15
0
    def test_input_qlty(self, binary_problem=False):
        """Check the output of ``Classifier.input_qlty``.

        A classifier is trained on the labeled fixture (the binary fixture
        when ``binary_problem`` is true), its models are exported and loaded
        into a fresh ``Classifier``, and ``input_qlty`` is run on the
        unlabeled fixture.  The result must be a DataFrame with two ``object``
        columns more than the unlabeled input fields; the second-to-last
        column may only contain ``"Good"``, ``"Bad"`` or ``"OK"``.

        Every entry of the ``"SuggestedFeatures"`` column (comma-separated)
        must name a known input field, and for ``field::value`` entries the
        suggested value must not already be present in the row.
        """
        if binary_problem:
            source_df = self.labeled_binary_inp_df
        else:
            source_df = self.labeled_inp_df
        train_df = source_df.copy(deep=True)
        unlabeled_df = self.nonlabeled_inp_df.copy(deep=True)
        schema = self.schema_with_label.copy()

        trainer = Classifier(model_configuration=testModelConfiguration)
        trainer.train(input_df=train_df, schema=schema)
        lr, fm, lm = trainer.get_models()

        ac = Classifier(model_configuration=testModelConfiguration)
        ac.load_models(lr, fm, lm)

        res_df = ac.input_qlty(input_df=unlabeled_df)

        self.assertIsInstance(res_df, pd.DataFrame)
        self.assertEqual(len(res_df.columns),
                         len(self.fields_without_label) + 2)
        # Use ``.iloc`` for positional dtype access; integer keys on the
        # label-indexed ``dtypes`` Series are a deprecated fallback.
        col_dtypes = res_df.dtypes
        self.assertEqual(col_dtypes.iloc[-1], "object")
        self.assertEqual(col_dtypes.iloc[-2], "object")

        # Compare scalar cell values.  The previous check tested 1-element
        # numpy rows for membership in a list of strings, which only worked
        # through the truthiness of a 1-element comparison array.
        allowed_qualities = ["Good", "Bad", "OK"]
        for quality in res_df.iloc[:, -2]:
            self.assertIn(quality, allowed_qualities)

        # Column positions are used to look up each field's schema type.
        result_columns = list(res_df.columns)

        # Test if all suggested features are not present
        def check_suggestions_absent(row):
            for feat in row["SuggestedFeatures"].split(','):
                if '::' not in feat:
                    # Plain feature name: must be a known field unless empty.
                    if len(feat) > 0:
                        self.assertIn(feat, self.fields_without_label)
                    continue

                field_name, field_value = feat.split('::')
                self.assertIn(field_name, self.fields_without_label)
                field_type = self.schema_without_label[
                    result_columns.index(field_name)]

                if field_type in ["text", "text2vec"]:
                    # Whole-word check: the value is padded with spaces.
                    self.assertNotIn(' ' + field_value + ' ',
                                     row[field_name].lower())
                elif field_type == "set":
                    if len(field_value) > 0:
                        self.assertNotIn(field_value, row[field_name])
                elif field_type in ["string", "numeric", "boolean"]:
                    self.assertNotEqual(field_value, row[field_name])

        res_df.apply(check_suggestions_absent, axis=1)
Exemplo n.º 16
0
def classify(cachedModelID, data):
    """Classify a dataset with a cached model and summarize results per label.

    The model is looked up in ``cachedMSR``; the dataset is converted to a
    DataFrame and restricted to the model's features.  ``predict_explain``
    and ``input_qlty`` are run (top 10 each) and their outputs combined.
    Results are then grouped by ``PredictedLabel``.

    Args:
        cachedModelID: Key into ``cachedMSR``.
        data: Mapping with a ``'features'`` list; each feature is a mapping
            with ``'feature'`` and ``'data'`` entries.

    Returns:
        A dict with ``'id'`` and ``'classSummaries'``.  Each class summary
        holds the label, its instance count, the per-instance probabilities
        and entropies, and one result dict per instance.  When the dataset
        has no features or no rows, ``{'id': -1, 'classSummaries': []}`` is
        returned instead.

    Raises:
        AssertionError: If ``cachedModelID`` is not in ``cachedMSR``.
    """
    startedTime = datetime.datetime.now()
    # Raise explicitly: an ``assert`` statement is removed under ``python -O``
    # and the lookup would then fail with an unrelated KeyError.  The
    # exception type and message are unchanged for existing callers.
    if cachedModelID not in cachedMSR:
        raise AssertionError("Model not found.")
    model = cachedMSR[cachedModelID]['selectedModel']

    emptyResults = {
        'id': -1,
        'classSummaries': []
    }

    #debug
    num_features = len(data['features'])
    print('Received a dataset with ', num_features, ' features to classify.')
    if num_features == 0:
        print('There is no feature, empty result set is returned.')
        return emptyResults
    num_rows = len(data['features'][0]['data'])
    print('Received a dataset with ', num_rows, ' rows to classify.')
    if num_rows == 0:
        print('There is no data, empty result set is returned.')
        return emptyResults

    candidate = model["candidate"]
    features = candidate["features"]
    config = candidate["config"]

    unlabeled_df = datasetToDataframe(data)
    filtered_input_df = unlabeled_df.filter([f['name'] for f in features])

    lr, fm, lm = loadTrainedModel(model)

    ac = Classifier(model_configuration=config)
    ac.load_models(lr, fm, lm)

    res_df = ac.predict_explain(input_df=filtered_input_df, topN_features=10)
    reccom_df = ac.input_qlty(input_df=filtered_input_df, topN=10)
    res_df = pd.concat([res_df, reccom_df.filter(["SuggestedFeatures"])], axis=1)

    plCountSeries = res_df.groupby('PredictedLabel').PredictedLabel.count()
    labels = list(plCountSeries.keys())

    classSummaries = []

    for label in labels:
        rows_for_label = res_df[res_df.PredictedLabel == label]
        entropies = []
        probabilities = []
        results = []
        # NOTE(review): data_index is the DataFrame index label and is also
        # used as a position into each feature's 'data' list — this assumes
        # datasetToDataframe yields a 0..n-1 index; confirm.
        for data_index, row in rows_for_label.iterrows():
            entropy = float(row.Entropy)
            entropies.append(entropy)
            probsDict, allLabels = unpackProbs(row.Probabilities)
            probability = float(probsDict[label])
            probabilities.append(probability)
            contributors = unpackContribs(row.TopContributors)
            recommends = unpackSuggestedFeatures(row.SuggestedFeatures)

            # One single-value feature entry per input feature for this row.
            input_data = [
                {'id': id(),
                 'feature': feat['feature'],
                 'data': [feat['data'][data_index]]}
                for feat in data['features']
            ]
            data_instance = {
                'id': id(),
                'dataset': {'id': id(),
                            'features': input_data},
                'index': data_index
            }

            results.append({
                'id': id(),
                'allLabels': allLabels,
                'entropy': entropy,
                'contributors': contributors,
                'dataInstance': data_instance,
                'predictedLabel': {
                    'id': id(),
                    'label': label,
                    'probability': probability
                },
                'recommends': recommends
            })

        classSummaries.append({
            'id': id(),
            'label': label,
            'numInstances': int(plCountSeries[label]),
            'probabilities': probabilities,
            'entropies': entropies,
            'results': results
        })

    batchClassificationResult = {
        'id': id(),
        "classSummaries": classSummaries
    }

    print('Classification time:' + str((datetime.datetime.now() - startedTime).total_seconds()) + ' seconds ')

    return batchClassificationResult