Example No. 1
    def do_iteration(i):
        print("Iteration " + str(i+1))
        dev_df = train.copy(deep=True)
        test_df = test.copy(deep=True)
        rng = np.random.RandomState()
        rng.seed(seeds[i])
        folds_i = iterative_stratification(dev_df, labels, cv_folds, rng)

        # Create the appropriate statistics container for this iteration.
        validation_stats_i = Statistics()
        testing_stats_i = Statistics()

        for train_idx, valid_idx in cv_iterator(folds_i):
            training_fold = dev_df.loc[train_idx, ]
            validation_fold = dev_df.loc[valid_idx, ]

            # shuffle the folds
            training_fold = training_fold.reindex(rng.permutation(training_fold.index))
            validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

            stats_valid, stats_test = multi_label_crf(
                labels=labels,
                df_train=training_fold,
                df_valid=validation_fold,
                df_test=test_df,
                binary=binary,
                connectivity='full',
                vectorizer_method=vectorizer_method
            )

            validation_stats_i.merge(stats_valid)
            testing_stats_i.merge(stats_test)

        log.write('Iteration {}\n'.format(i))
        validation_stats_i.write(log, 'a')
        testing_stats_i.write(log, 'a')
        return validation_stats_i, testing_stats_i
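
The helper above seeds a fresh RandomState per iteration and shuffles each fold by reindexing with a permuted index. A minimal, self-contained sketch of that seeding-and-shuffling pattern (the toy DataFrame and seed list are illustrative, not taken from the repository):

import numpy as np
import pandas as pd

# Illustrative seeds and training frame; the real code uses the experiment's data.
seeds = [42, 7, 1010]
train = pd.DataFrame({
    'terms': ['GO:1 GO:2', 'GO:2', 'GO:3 GO:1', 'GO:4'],
    'label': ['activation', 'inhibition', 'activation', 'binding']
})

for i, seed in enumerate(seeds):
    rng = np.random.RandomState(seed)      # one reproducible RNG per iteration
    dev_df = train.copy(deep=True)         # never mutate the shared frame
    dev_df = dev_df.reindex(rng.permutation(dev_df.index))
    print("Iteration {}: row order {}".format(i + 1, list(dev_df.index)))

Seeding per iteration keeps each repeat reproducible while still giving every iteration its own independent shuffle and stratification.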
Example No. 2
        def do_fold(j):
            print("\tFold " + str(j+1))
            train_idx = folds_i[j][0]
            valid_idx = folds_i[j][1]
            training_fold = developement_df.loc[train_idx, ]
            training_fold = training_fold.reset_index(drop=True)
            validation_fold = developement_df.loc[valid_idx, ]
            validation_fold = validation_fold.reset_index(drop=True)

            # Statistics containers for this fold.
            training_stats_i_f = Statistics()
            validation_stats_i_f = Statistics()
            testing_stats_i_f = Statistics()

            # Init the label ranking lists.
            label_pred_proba_train = []
            label_pred_proba_valid = []
            label_pred_proba_test = []

            label_y_train = []
            label_y_valid = []
            label_y_test = []

            # Set up the vectorizer for the bag-of-words representation
            if vectorizer_method == 'tf-idf':
                vectorizer = TfidfVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True,
                    sublinear_tf=True, max_df=1.0, min_df=0
                )
                vectorizer.fit(training_fold['terms'].values)
                alpha = None
                percentile = 100
            elif vectorizer_method == 'count':
                vectorizer = CountVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True
                )
                vectorizer.fit(training_fold['terms'].values)
                alpha = None
                percentile = 100
            else:
                raise ValueError("Unsupported vectorizer_method: {!r}.".format(vectorizer_method))

            selectors = generate_selectors(selection, vectorizer.get_feature_names(), dag)
            base_estimators = make_classifiers(method, balanced, labels, selectors, selection, rng)
            for label in sorted(labels):
                print("\t\tFitting for label {}...".format(label))

                # SVMs assume standardised features, so we scale the features without centring
                # on the mean, which preserves the sparsity structure of the counts. Scaling may
                # also speed up convergence of the linear models.
                x_train_l = vectorizer.transform(training_fold['terms'].values)
                y_train_l = np.asarray(training_fold[label].values, dtype=int)

                x_valid_l = vectorizer.transform(validation_fold['terms'].values)
                y_valid_l = np.asarray(validation_fold[label].values, dtype=int)

                x_test_l = vectorizer.transform(testing_df['terms'].values)
                y_test_l = np.asarray(test_df_i[label].values, dtype=int)

                if scale:
                    x_train_l = mean_center(x_train_l, with_mean=False)
                    x_valid_l = mean_center(x_valid_l, with_mean=False)
                    x_test_l = mean_center(x_test_l, with_mean=False)

                # We generate the folds for randomised search up-front. We hold out one of the
                # folds for probability calibration so each sampled param set gets calibrated on
                # the same data. This leaves cv_folds-2 folds for randomised search cross-validation.
                # cv_rand = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
                base_estimator_l = base_estimators[label]
                fresh_estimator = clone(base_estimator_l)

                # Find the best params, then do a final proper calibration.
                params = sk_generate_params(method, selection)
                estimator_l = RandomizedSearchCV(
                    estimator=base_estimator_l, param_distributions=params,
                    n_iter=60, scoring='f1', cv=3, random_state=rng,
                    error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs',
                    refit=True
                )

                # Test if there's any signal if we permute the labels.
                # Classifier should do poorly if we do so.
                if permute:
                    y_train_l = rng.permutation(y_train_l)

                threshold = 0.5
                estimator_l.fit(x_train_l, y_train_l)
                best_params_l = estimator_l.best_params_

                # Calibrate the random forest with the best hyperparameters.
                if method not in ['lr']:
                    estimator_l = CalibratedClassifierCV(fresh_estimator.set_params(**best_params_l),
                                                         cv=3, method='sigmoid')
                    estimator_l.fit(x_train_l, y_train_l)

                # Evaluate Performance characteristics and test on training to check overfitting.
                y_train_prob_l = estimator_l.predict_proba(x_train_l)
                y_valid_prob_l = estimator_l.predict_proba(x_valid_l)
                y_test_prob_l = estimator_l.predict_proba(x_test_l)
                training_stats_i_f.merge(evaluate_model(y_train_l, y_train_prob_l, label, threshold))
                validation_stats_i_f.merge(evaluate_model(y_valid_l, y_valid_prob_l, label, threshold))

                # Compute independent test data performance
                testing_stats_i_f.merge(evaluate_model(y_test_l, y_test_prob_l, label, threshold))

                # Get label ranking info
                label_pred_proba_train.append([p[1] for p in y_train_prob_l])
                label_pred_proba_valid.append([p[1] for p in y_valid_prob_l])
                label_pred_proba_test.append([p[1] for p in y_test_prob_l])

                label_y_train.append(y_train_l)
                label_y_valid.append(y_valid_l)
                label_y_test.append(y_test_l)

                print(validation_stats_i_f.frame())

            # Compute multi-label performance statistics
            y = np.column_stack(label_y_train)
            y_prob = np.column_stack(label_pred_proba_train)
            training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.column_stack(label_y_valid)
            y_prob = np.column_stack(label_pred_proba_valid)
            validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.column_stack(label_y_test)
            y_prob = np.column_stack(label_pred_proba_test)
            testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            return training_stats_i_f, validation_stats_i_f, testing_stats_i_f
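
Each label's classifier above is tuned with RandomizedSearchCV and then, for non-logistic-regression methods, refit inside CalibratedClassifierCV so that predict_proba is calibrated before thresholding at 0.5. A minimal scikit-learn sketch of that search-then-calibrate pattern on synthetic data; LinearSVC and the small C grid are placeholders for the repository's make_classifiers/sk_generate_params output:

import numpy as np
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
rng = np.random.RandomState(0)

base = LinearSVC()                               # placeholder for base_estimators[label]
search = RandomizedSearchCV(
    estimator=base,
    param_distributions={'C': [0.01, 0.1, 1.0, 10.0]},
    n_iter=4, scoring='f1', cv=3, random_state=rng, refit=True,
)
search.fit(X, y)

# Refit a fresh clone with the best params inside a sigmoid calibrator, mirroring
# the `if method not in ['lr']` branch above, so probability estimates are available.
calibrated = CalibratedClassifierCV(clone(base).set_params(**search.best_params_),
                                    cv=3, method='sigmoid')
calibrated.fit(X, y)
proba = calibrated.predict_proba(X)[:, 1]        # positive-class probabilities
predictions = (proba >= 0.5).astype(int)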
Example No. 3
            training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.column_stack(label_y_valid)
            y_prob = np.column_stack(label_pred_proba_valid)
            validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.column_stack(label_y_test)
            y_prob = np.column_stack(label_pred_proba_test)
            testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            return training_stats_i_f, validation_stats_i_f, testing_stats_i_f

        # For each iteration, batch the folds into parallel jobs
        statistics_objects_i = parallel_map(do_fold, range(cv_folds), n_jobs)
        for (train, val, test) in statistics_objects_i:
            training_stats_i.merge(train)
            validation_stats_i.merge(val)
            testing_stats_i.merge(test)

        log.write('Iteration {}\n'.format(i))
        log.write('Training {}\n'.format(i))
        training_stats_i.write(log, 'a')
        log.write('Validation {}\n'.format(i))
        validation_stats_i.write(log, 'a')
        log.write('Testing {}\n'.format(i))
        testing_stats_i.write(log, 'a')

        statistics_objects.append([training_stats_i, validation_stats_i, testing_stats_i])
        # return training_stats_i, validation_stats_i, testing_stats_i

    # containers = parallel_map(do_iteration, range(iterations), n_jobs=n_jobs)
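
This variant fans the folds out to worker processes with the repository's parallel_map helper and merges the returned Statistics objects afterwards. A rough sketch of the same fan-out/merge pattern using joblib instead; the do_fold body here is a stand-in returning plain tuples, since the real per-fold work depends on the enclosing scope:

from joblib import Parallel, delayed

def do_fold(j):
    # Stand-in for the real per-fold work, which returns
    # (training, validation, testing) Statistics objects.
    return ({'fold': j, 'split': 'train'},
            {'fold': j, 'split': 'valid'},
            {'fold': j, 'split': 'test'})

cv_folds, n_jobs = 5, 2
fold_results = Parallel(n_jobs=n_jobs)(delayed(do_fold)(j) for j in range(cv_folds))
for train_res, valid_res, test_res in fold_results:   # merge step in the real code
    print(train_res, valid_res, test_res)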
Example No. 4
def multi_label_crf(
        labels,
        df_train,
        df_valid,
        df_test,
        binary,
        connectivity='full',
        vectorizer_method='count'
    ):
    """
    Do you suffer from acute label correlations? Are your samples a part of more than
    one class? Do you have signs of labels have dependency? If you answered yes to at least
    one of those questions then sign up for structured learning today. For a low monthly
    membership fee of $39.99 you can solve all your multi-label woes!

    @param labels:
    @param df_train:
    @param df_valid:
    @param df_test:
    @param connectivity:
    @param vectorizer_method:

    @return:
    """
    stats_container_valid = Statistics()
    stats_container_test = Statistics()

    if vectorizer_method == 'tf-idf':
        vectorizer_node = TfidfVectorizer(
            stop_words=['go:', '', ' '], binary=binary, lowercase=True, sublinear_tf=False, max_df=1.0, min_df=0)
        vectorizer_node.fit(df_train['terms'].values)
        alpha = None
        percentile = 100
    elif vectorizer_method == 'count':
        vectorizer_node = CountVectorizer(stop_words=['go', '', ' '], binary=binary, lowercase=True)
        vectorizer_node.fit(df_train['terms'].values)
        alpha = None
        percentile = 100
    else:
        raise ValueError("Unsupported vectorizer_method: {!r}.".format(vectorizer_method))

    x_node_train, y_train, feature_names, selector_node = prep.select_features(
        df=df_train,
        vectorizer=vectorizer_node,
        feature_col='terms',
        label_col='label',
        select_method=None,
        continuous_col=[],
        alpha=alpha,
        percentile=percentile
    )
    x_node_valid, y_valid = prep.transform_features(
        df=df_valid,
        vectorizer=vectorizer_node,
        selector=selector_node,
        feature_col='terms',
        label_col='label',
        continuous_cols=[]
    )

    y_train = np.asarray([prep.binarise_labels(x, labels) for x in y_train], dtype=int)
    y_valid = np.asarray([prep.binarise_labels(x, labels) for x in y_valid], dtype=int)

    if connectivity == 'full':
        n_labels = len(labels)
        edges = np.vstack([x for x in itertools.combinations(range(n_labels), 2)])
        model = MultiLabelClf(n_labels=n_labels, edges=edges, inference_method='ad3')
    elif connectivity == 'tree':
        edges = chow_liu_tree(y_train)
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='max-product')
    else:
        edges = None
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='unary')
    x_train = x_node_train.toarray()
    x_valid = x_node_valid.toarray()

    # -------------------- MAKE THE ESTIMATOR -------------------- #
    estimator = OneSlackSSVM(model, max_iter=2, tol=0.001, n_jobs=1)

    # -------------------- LEARN/STATS -------------------- #
    estimator.fit(x_train, y_train)
    stats_container_valid.merge(evaluate_crf_model(x_valid, y_valid, estimator, labels))

    if isinstance(df_test, pd.DataFrame):
        x_node_test, y_test = prep.transform_features(
            df=df_test,
            vectorizer=vectorizer_node,
            selector=selector_node,
            feature_col='terms',
            label_col='label',
            continuous_cols=[]
        )
        y_test = np.asarray([prep.binarise_labels(x, labels) for x in y_test], dtype=int)
        x_test = x_node_test.toarray()
        stats_container_test.merge(evaluate_crf_model(x_test, y_test, estimator, labels))

    # -------------------- RETURN -------------------- #
    if isinstance(df_test, pd.DataFrame):
        return stats_container_valid, stats_container_test
    else:
        return stats_container_valid
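
The structured model above comes from MultiLabelClf wrapped in a OneSlackSSVM learner. A minimal end-to-end sketch on random toy data, assuming these classes come from pystruct and using the edge-free 'unary' inference branch so no external solver (ad3) is needed; the shapes and label count are illustrative:

import numpy as np
from pystruct.models import MultiLabelClf
from pystruct.learners import OneSlackSSVM

rng = np.random.RandomState(0)
X = rng.randn(40, 5)                              # dense node features, as in x_node_train.toarray()
Y = (rng.rand(40, 3) > 0.5).astype(int)           # binarised multi-label targets

# edges=None with 'unary' inference treats labels independently;
# the 'full'/'tree' branches above add pairwise edges instead.
model = MultiLabelClf(n_labels=3, edges=None, inference_method='unary')
learner = OneSlackSSVM(model, max_iter=20, tol=0.001, n_jobs=1)
learner.fit(X, Y)
Y_pred = np.vstack(learner.predict(X))
print("per-cell accuracy:", (Y_pred == Y).mean())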
Example No. 5
    def do_iteration(data):
    # for i in xrange(iterations):
        i = data[0]
        print("Iteration " + str(i+1))
        train_i = train.copy(deep=True)
        rng = np.random.RandomState()
        rng.seed(seeds[i])
        su_make_dir('llda/models/iteration-{}'.format(i+1))
        folds_i = iterative_stratification(train_i, labels, cv_folds, rng)
        combinations = itertools.combinations(range(0, cv_folds), cv_folds-1)

        # Create the appropriate statistics container for this iteration.
        validation_stats_i = Statistics()
        testing_stats_i = Statistics()

        for n, j in enumerate(combinations):
            # print('\tFold ' + str(n+1))
            su_make_dir('llda/models/iteration-{}/fold-{}'.format(i+1, n+1))
            file_path = 'llda/models/iteration-{}/fold-{}/'.format(i+1, n+1)

            training_folds_j = folds_i[list(j)]
            validation_fold_j = folds_i[[f for f in range(0, cv_folds) if f not in j]]
            assert len(validation_fold_j) == 1

            training_fold = reduce(
                lambda x, y: pd.concat([x, y], ignore_index=True, copy=True),
                training_folds_j[1:],
                training_folds_j[0]
            )
            validation_fold = validation_fold_j[0]

            # shuffle the folds
            if balanced:
                training_fold = training_fold.reindex(rng.permutation(training_fold.index))
                training_fold = prep.reduce_label_bias(training_fold, labels, 'activation', 5, random_state=rng)
            training_fold = training_fold.reindex(rng.permutation(training_fold.index))
            validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

            write_tsv(training_fold, file_path + '/train.tsv', test=False)
            write_tsv(validation_fold, file_path + '/valid.tsv', test=True)
            write_tsv(test, file_path + '/test.tsv', test=True)

            # ------------------ CALL JAVA TO LLDA ON THIS PARTAY ----------------- #
            DEVNULL = open(os.devnull, 'w')
            args = [
                'java',
                '-jar',
                '-Xmx2048m',
                'llda/tmt-0.4.0.jar',
                'llda/llda.scala',
                file_path,
                '/train.tsv',
                '/valid.tsv',
                '/test.tsv',
                'model-{}-{}'.format(i+1, n+1),
                '{}-{}'.format(i+1, n+1)
            ]
            p = Popen(args, stdout=DEVNULL, stderr=STDOUT)
            p.wait()

            # Perform evaluation
            validation_proba = np.genfromtxt('llda/results/validation-{}-{}.csv'.format(i+1, n+1), delimiter=',')[:, 1:]
            test_proba = np.genfromtxt('llda/results/test-{}-{}.csv'.format(i+1, n+1), delimiter=',')[:, 1:]
            labels_j = get_labels_from_model(file_path + '/llda-cvb0-model-{}-{}'.format(1+i, n+1))

            validation_stats_i_j = Statistics()
            testing_stats_i_j = Statistics()
            for l_index, l in enumerate(labels_j):
                y_validation = [p for p in validation_fold[l].values]
                y_proba_validation = [[1-p, p] for p in validation_proba[:, l_index]]
                y_pred_validation = [int(p >= threshold) for p in validation_proba[:, l_index]]

                y_hprd = [p for p in test[l].values]
                y_proba_hprd = [[1-p, p] for p in test_proba[:, l_index]]
                y_pred_hprd = [int(p >= threshold) for p in test_proba[:, l_index]]

                validation_stats_i_j = evaluate_model(
                    y=y_validation,
                    y_pred=y_pred_validation,
                    y_pred_prob=y_proba_validation,
                    label=l,
                    statistics=validation_stats_i_j,
                    verbose=0
                )

                testing_stats_i_j = evaluate_model(
                    y=y_hprd,
                    y_pred=y_pred_hprd,
                    y_pred_prob=y_proba_hprd,
                    label=l,
                    statistics=testing_stats_i_j,
                    verbose=0
                )
            validation_stats_i.merge(validation_stats_i_j)
            testing_stats_i.merge(testing_stats_i_j)

        return validation_stats_i, testing_stats_i
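
After the Java L-LDA run, the per-label probabilities are read back from CSV and turned into [1-p, p] pairs plus hard calls at the threshold. A small self-contained sketch of that post-processing step, with an in-memory matrix standing in for the np.genfromtxt output and illustrative label names:

import numpy as np

threshold = 0.5
labels_j = ['activation', 'inhibition', 'binding']

# Stand-in for np.genfromtxt('llda/results/validation-...csv', delimiter=',')[:, 1:]
validation_proba = np.array([[0.9, 0.2, 0.4],
                             [0.1, 0.7, 0.6]])

for l_index, l in enumerate(labels_j):
    column = validation_proba[:, l_index]
    y_proba = [[1 - p, p] for p in column]           # [P(neg), P(pos)] pairs for evaluate_model
    y_pred = [int(p >= threshold) for p in column]   # hard calls at the 0.5 threshold
    print(l, y_pred)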
Example No. 6
                    y_pred_prob=y_proba_hprd,
                    label=l,
                    statistics=testing_stats_i_j,
                    verbose=0
                )
            validation_stats_i.merge(validation_stats_i_j)
            testing_stats_i.merge(testing_stats_i_j)

        return validation_stats_i, testing_stats_i

    containers = parallel_map(do_iteration, zip(range(iterations), seeds), n_jobs=n_jobs)
    valid_containers = [containers[i][0] for i in range(iterations)]
    test_containers = [containers[i][1] for i in range(iterations)]

    for container in valid_containers:
        validation_stats.merge(container)

    for container in test_containers:
        testing_stats.merge(container)

    # --------------------- FINAL RESULTS ---------------------------- #
    direc = tempfile.mkdtemp(prefix='LLDA-{}-'.format(date), dir='results/')
    with open(direc + '/LLDA-statistics.pkl', 'wb') as fp:
        pickle.dump((validation_stats, testing_stats, config), fp)
    results = open(direc + '/LLDA-results.txt', 'w')

    results.write("\nRun Settings: \n")
    results.write("\tDate: \t\t\t\t{0}\n".format(date))
    results.write("\tMethod: \t\t\t{0}\n".format('L-LDA'))
    results.write("\tBinary: \t\t\t{0}\n".format('NA'))
    results.write("\tBalanced: \t\t\t{0}\n".format(balanced))
    results.write("\tChained: \t\t\t{0}\n".format('NA'))