示例#1
0
def run_trials(num_trials, dataset, selection_strategy, metric, C, alpha, smoothing, \
                bootstrap_size, balance, coverage, disagree_strat, budget, fmtype, rmw_n, rmw_a, seed=0, Debug=False, \
                reasoning_strategy='random', switch=40):
    """Run `num_trials` independent active-learning trials on `dataset`.

    Each trial gets its own deterministic seed (`seed` + trial index), a fresh
    instance model, feature model, pooling model and reasoning model, and its
    own bootstrap split before delegating to `learn`.

    Returns a tuple `(result, feat_names, feat_freq)`:
    - result: object ndarray with one `learn(...)` result per trial,
    - feat_names: feature names (or integer indices when the loader has none),
    - feat_freq: per-feature document frequency over the pool.
    """
    X_pool, y_pool, X_test, y_test, feat_names = load_dataset(dataset)

    # Fall back to integer indices when the loader supplies no feature names.
    if not feat_names:
        feat_names = np.arange(X_pool.shape[1])

    # Number of documents each feature occurs in (non-zeros per CSC column).
    feat_freq = np.diff(X_pool.tocsc().indptr)

    # NOTE(review): the feature expert's smoothing is hard-coded to 1e-6 here;
    # the `smoothing` argument only configures the uniform feature model below.
    fe = feature_expert(X_pool, y_pool, metric, smoothing=1e-6, C=C,
                        pick_only_top=True)

    result = np.ndarray(num_trials, dtype=object)

    for trial in range(num_trials):
        print('-' * 50)
        print('Starting Trial %d of %d...' % (trial + 1, num_trials))

        trial_seed = seed + trial  # deterministic per-trial seed

        instance_model = MultinomialNB(alpha=alpha)

        # Pick the feature-model flavor; anything else is a caller error.
        if fmtype == "fm_uniform":
            feature_model = FeatureMNBUniform([], [], fe.num_features,
                                              smoothing)
        elif fmtype == "fm_weighted":
            feature_model = FeatureMNBWeighted(num_feat=fe.num_features,
                                               imaginary_counts=1.)
        else:
            raise ValueError('Feature model type: \'%s\' invalid!' % fmtype)

        pooling_model = PoolingMNB()
        reasoning_model = ReasoningMNB(alpha=1)

        # Either start from an empty training set with the whole pool, or
        # draw a (possibly class-balanced) random bootstrap.
        if bootstrap_size == 0:
            training_set = []
            pool_set = range(X_pool.shape[0])
        else:
            training_set, pool_set = RandomBootstrap(X_pool, y_pool,
                                                     bootstrap_size, balance,
                                                     trial_seed)

        result[trial] = learn(X_pool, y_pool, X_test, y_test, training_set,
                              pool_set, fe, selection_strategy, disagree_strat,
                              coverage, budget, instance_model, feature_model,
                              pooling_model, reasoning_model, rmw_n, rmw_a,
                              trial_seed, Debug, reasoning_strategy, switch)

    return result, feat_names, feat_freq
示例#2
0
def full_knowledge(dataset, metric='mutual_info', C=0.1, alpha=1, smoothing=0):
    '''
    This function uses the entire pool to learn the instance model,
    feature model, and pooling model which provides an upper bound on how well
    these models can perform on this particular dataset.

    Returns (instance_model_scores, feature_model_scores,
    pooling_model_scores), each a dict with 'auc' and 'accu' lists.
    (Previously the scores were computed but never returned, making them
    unreachable by callers; returning them is backward-compatible.)
    '''
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)
    # NOTE(review): the expert's smoothing is hard-coded to 1e-6; the
    # `smoothing` argument only configures the feature model below.
    fe = feature_expert(X_pool, y_pool, metric, smoothing=1e-6, C=C)

    instance_model_scores = {'auc': [], 'accu': []}
    feature_model_scores = {'auc': [], 'accu': []}
    pooling_model_scores = {'auc': [], 'accu': []}

    instance_model = MultinomialNB(alpha=alpha)
    feature_model = FeatureMNBUniform([], [], fe.num_features, smoothing)
    pooling_model = PoolingMNB()

    # Fit the instance model on the full pool, and feed the feature model
    # the expert's most informative feature for every document.
    instance_model.fit(X_pool, y_pool)
    for doc in range(X_pool.shape[0]):
        feature = fe.most_informative_feature(X_pool[doc], y_pool[doc])
        feature_model.fit(feature, y_pool[doc])

    # Evaluate performance based on Instance Model
    (accu, auc) = evaluate_model(instance_model, X_test, y_test)
    instance_model_scores['auc'].append(auc)
    instance_model_scores['accu'].append(accu)
    print('Instance Model: auc = %f, accu = %f' % (auc, accu))

    # Evaluate performance on Feature Model
    (accu, auc) = evaluate_model(feature_model, X_test, y_test)
    feature_model_scores['auc'].append(auc)
    feature_model_scores['accu'].append(accu)
    print('Feature Model: auc = %f, accu = %f' % (auc, accu))

    # Evaluate performance on Pooled Model (equal-weight mixture of the two).
    pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])
    (accu, auc) = evaluate_model(pooling_model, X_test, y_test)
    pooling_model_scores['auc'].append(auc)
    pooling_model_scores['accu'].append(accu)
    print('Pooled Model: auc = %f, accu = %f' % (auc, accu))

    return instance_model_scores, feature_model_scores, pooling_model_scores
示例#3
0
def full_knowledge(dataset, metric='mutual_info', C=0.1, alpha=1, smoothing=0):
    '''
    This function uses the entire pool to learn the instance model,
    feature model, and pooling model which provides an upper bound on how well
    these models can perform on this particular dataset
    '''
    X_pool, y_pool, X_test, y_test, feat_names = load_dataset(dataset)
    fe = feature_expert(X_pool, y_pool, metric, smoothing=1e-6, C=C)

    instance_model_scores = {'auc': [], 'accu': []}
    feature_model_scores = {'auc': [], 'accu': []}
    pooling_model_scores = {'auc': [], 'accu': []}

    instance_model = MultinomialNB(alpha=alpha)
    feature_model = FeatureMNBUniform([], [], fe.num_features, smoothing)
    pooling_model = PoolingMNB()

    # Fit the instance model on the whole pool, then feed the feature model
    # the expert's most informative feature of every document, one at a time.
    instance_model.fit(X_pool, y_pool)
    for doc_idx in range(X_pool.shape[0]):
        best_feat = fe.most_informative_feature(X_pool[doc_idx],
                                                y_pool[doc_idx])
        feature_model.fit(best_feat, y_pool[doc_idx])

    def _score(model, scores, label):
        # Evaluate `model` on the test split, record accuracy/AUC, and print.
        accu, auc = evaluate_model(model, X_test, y_test)
        scores['auc'].append(auc)
        scores['accu'].append(accu)
        print('%s: auc = %f, accu = %f' % (label, auc, accu))

    _score(instance_model, instance_model_scores, 'Instance Model')
    _score(feature_model, feature_model_scores, 'Feature Model')

    # The pooled model mixes the two trained models with equal weight.
    pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])
    _score(pooling_model, pooling_model_scores, 'Pooled Model')
示例#4
0
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, smoothing=0, poolingMNBWeights=[0.5, 0.5], poolingFM_r=100.0, lr_C=1, svm_C=1, \
          zaidan_C=1, zaidan_Ccontrast=1, zaidan_nu=1, cvTrain=False, Debug=False):
    """Run one pass of rationale-aware active learning and return its history.

    Starting from `training_set`, the function trains a model of `model_type`,
    then repeatedly asks `doc_pick_model` (chosen from `selection_strategy`)
    for `step_size` documents from `pool_set`, obtains a rationale feature for
    each from `feature_expert`, moves the documents into the training set, and
    retrains/re-evaluates until `budget` documents have been labeled.

    Returns (np.array(num_training_samples), model_scores) where
    `num_training_samples[i]` is the number of labeled docs at evaluation i and
    `model_scores` holds per-evaluation 'auc'/'accu' plus the hyperparameter
    values used ('wr', 'wo', 'alpha', 'svm_C', zaidan_* , pooling* entries).

    NOTE(review): `poolingMNBWeights=[0.5, 0.5]` is a mutable default argument
    (shared across calls); it is only read here, but confirm no caller mutates
    the returned/shared list.
    NOTE(review): for svm model types, `C`, and for mnb/svm types `w_r`/`w_o`
    re-tuning, are only assigned inside the `cvTrain` branches; calling with
    cvTrain=False and model_type='svm_linear' would hit an undefined `C`
    below — confirm callers always pass cvTrain=True for those model types.
    """

    start = time()
    print '-' * 50
    print 'Starting Active Learning...'

    _, num_feat = X_pool.shape
    # Per-evaluation history of scores and of the hyperparameters in effect.
    model_scores = {
        'auc': [],
        'accu': [],
        'wr': [],
        'wo': [],
        'alpha': [],
        'svm_C': [],
        'zaidan_C': [],
        'zaidan_Ccontrast': [],
        'zaidan_nu': [],
        'FMrvalue': [],
        'IMweight': [],
        'FMweight': []
    }

    # Rationale features seen so far, overall and per class.
    rationales = set()
    rationales_c0 = set()
    rationales_c1 = set()

    number_of_docs = 0

    feature_expert.rg.seed(seed)

    num_training_samples = []

    # all_features[i] is the rationale for the i-th labeled doc (may be None);
    # it is indexed by `feature_counter` in the same order as `training_set`.
    all_features = []

    # keep all the training data instance ids in docs list

    docs = training_set

    X_train = None
    y_train = []
    sample_weight = []

    # --- Bootstrap phase: query the expert for a rationale per seed doc. ---
    for doc_id in docs:
        #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
        feature = feature_expert.any_informative_feature(
            X_pool[doc_id], y_pool[doc_id])

        number_of_docs = number_of_docs + 1

        # append feature to all_features, even if it is None
        all_features.append(feature)

        if feature is not None:
            rationales.add(feature)

            if y_pool[doc_id] == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)

    # --- cvTrain: tune hyperparameters and build the (reweighted) X_train. ---
    if cvTrain:
        # get optimal parameters depending on the model_type

        if model_type == 'mnb_LwoR':
            w_r, w_o = optimalMNBLwoRParameters(X_pool[training_set],
                                                y_pool[training_set],
                                                all_features)

            # Scale each doc's rationale feature by w_r and the rest by w_o.
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))

                y_train.append(y_pool[doc_id])

        elif model_type == 'mnb':
            w_r, w_o = optimalMNBParameters(X_pool[training_set],
                                            y_pool[training_set], all_features)

            # Same reweighting scheme as above with the mnb-tuned weights.
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))

                y_train.append(y_pool[doc_id])

        elif model_type == 'svm_linear':
            # Also tunes the SVM regularization constant C.
            w_r, w_o, C = optimalSVMParameters(X_pool[training_set],
                                               y_pool[training_set],
                                               all_features, seed)

            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))

                y_train.append(y_pool[doc_id])

        elif model_type == 'svm_linear_LwoR':

            w_r, w_o, C = optimalSVMLwoRParameters(X_pool[training_set],
                                                   y_pool[training_set],
                                                   all_features, seed)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))

                y_train.append(y_pool[doc_id])

        if model_type == 'poolingMNB':
            # Empirical class priors from the labeled docs.
            classpriors = np.zeros(2)
            classpriors[1] = (np.sum(y_pool[docs]) * 1.) / (len(docs) * 1.)
            classpriors[0] = 1. - classpriors[1]

            alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(
                X_pool[training_set], y_pool[training_set], all_features,
                smoothing, num_feat)

            feature_model = FeatureMNBUniform(rationales_c0, rationales_c1,
                                              num_feat, smoothing, classpriors,
                                              poolingFM_r)

            feature_counter = 0
            for doc_id in docs:
                if all_features[feature_counter]:
                    # updates feature model with features one at a time
                    feature_model.fit(all_features[feature_counter],
                                      y_pool[doc_id])
                feature_counter = feature_counter + 1

                # Instances are used unweighted for the pooling model.
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))

                y_train.append(y_pool[doc_id])

        if model_type == 'Zaidan':

            zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(
                X_pool[training_set], y_pool[training_set], all_features, seed)

            feature_counter = 0

            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                if all_features[feature_counter] is not None:
                    x_pseudo = (X_pool[doc_id]).todense()

                    # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                    x_feats = x[0].indices

                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            test = x[0, f]
                            x_pseudo[0, f] = x[0, f] / zaidan_nu
                        else:
                            x_pseudo[0, f] = 0.0
                    x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)

                # Stack the real instance, then its pseudoinstance (if any).
                if not y_train:
                    X_train = x
                    if all_features[feature_counter] is not None:
                        X_train = sp.vstack((X_train, x_pseudo))
                else:
                    X_train = sp.vstack((X_train, x))
                    if all_features[feature_counter] is not None:
                        X_train = sp.vstack((X_train, x_pseudo))

                y_train.append(y_pool[doc_id])
                if all_features[feature_counter] is not None:
                    # append y label again for the pseudoinstance created
                    y_train.append(y_pool[doc_id])

                sample_weight.append(zaidan_C)
                if all_features[feature_counter] is not None:
                    # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                    sample_weight.append(zaidan_Ccontrast)

                feature_counter = feature_counter + 1

    # Train the model

    if model_type == 'lrl2':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C,
                                   penalty='l2',
                                   random_state=random_state)
    elif model_type == 'lrl1':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C,
                                   penalty='l1',
                                   random_state=random_state)
    elif model_type == 'mnb':
        model = MultinomialNB(alpha=alpha)
    elif model_type == 'mnb_LwoR':
        model = MultinomialNB(alpha=alpha)
    elif model_type == 'svm_linear_LwoR':
        # NOTE(review): `C` is only bound by the cvTrain tuning above.
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type == 'svm_linear':
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type == 'poolingMNB':
        instance_model = MultinomialNB(alpha=alpha)
        model = PoolingMNB()
    elif model_type == 'Zaidan':
        # NOTE(review): this initial fit uses C=1.0 while the refits inside
        # the active-learning loop use C=svm_C — confirm this is intentional.
        random_state = np.random.RandomState(seed=seed)
        model = svm.SVC(kernel='linear', C=1.0, random_state=random_state)

    if model_type == 'poolingMNB':
        instance_model.fit(X_train, y_train)
        model.fit(instance_model, feature_model,
                  weights=poolingMNBWeights)  # train pooling_model
    elif model_type == 'Zaidan':
        model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
    else:
        model.fit(X_train, np.array(y_train))

    # --- Record scores plus the hyperparameters relevant to this model_type
    # (irrelevant entries are padded with 0.0 to keep all lists aligned). ---
    (accu, auc) = evaluate_model(model, X_test, y_test)
    model_scores['auc'].append(auc)
    model_scores['accu'].append(accu)
    if model_type == 'poolingMNB':
        model_scores['alpha'].append(alpha)
        model_scores['FMrvalue'].append(poolingFM_r)
        model_scores['IMweight'].append(poolingMNBWeights[0])
        model_scores['FMweight'].append(poolingMNBWeights[1])
    else:
        model_scores['FMrvalue'].append(0.0)
        model_scores['IMweight'].append(0.0)
        model_scores['FMweight'].append(0.0)

    if model_type == 'Zaidan':
        model_scores['zaidan_C'].append(zaidan_C)
        model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
        model_scores['zaidan_nu'].append(zaidan_nu)
    else:
        model_scores['zaidan_C'].append(0.0)
        model_scores['zaidan_Ccontrast'].append(0.0)
        model_scores['zaidan_nu'].append(0.0)

    if model_type == 'mnb' or model_type == 'mnb_LwoR':
        model_scores['alpha'].append(alpha)
    else:
        model_scores['alpha'].append(0.0)

    if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR':
        model_scores['svm_C'].append(C)
    else:
        model_scores['svm_C'].append(0.0)

    if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR' or model_type == 'mnb' or model_type == 'mnb_LwoR':
        model_scores['wr'].append(w_r)
        model_scores['wo'].append(w_o)
    else:
        model_scores['wr'].append(0.0)
        model_scores['wo'].append(0.0)

    num_training_samples.append(number_of_docs)

    feature_expert.rg.seed(seed)

    # --- Pick the document-selection strategy for the active-learning loop. ---
    if selection_strategy == 'random':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'unc':
        doc_pick_model = UNCSampling()
    elif selection_strategy == "pnc":
        doc_pick_model = UNCPreferNoConflict()
    elif selection_strategy == "pnr":
        doc_pick_model = UNCPreferNoRationale()
    elif selection_strategy == "pr":
        doc_pick_model = UNCPreferRationale()
    elif selection_strategy == "pc":
        doc_pick_model = UNCPreferConflict()
    elif selection_strategy == "tt":
        doc_pick_model = UNCThreeTypes()
    elif selection_strategy == "pipe":
        doc_pick_model = Pipe([UNCSampling(), UNCPreferConflict()], [10, 30])
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' %
                         selection_strategy)

    k = step_size

    #while X_train.shape[0] < budget:
    # --- Active-learning loop: query, label, retrain, evaluate. ---
    while number_of_docs < budget:

        # Choose a document based on the strategy chosen
        # (rationale-aware strategies also receive the per-class rationale
        # sets and the topk parameter).
        if selection_strategy == "pnc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        elif selection_strategy == "pnr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        elif selection_strategy == "pr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        elif selection_strategy == "pc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        elif selection_strategy == "tt":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        elif selection_strategy == "pipe":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)

        # Stop early if the strategy has nothing left to offer.
        if doc_ids is None or len(doc_ids) == 0:
            break

        for doc_id in doc_ids:
            #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
            feature = feature_expert.any_informative_feature(
                X_pool[doc_id], y_pool[doc_id])

            all_features.append(feature)

            number_of_docs = number_of_docs + 1

            if feature is not None:
                rationales.add(feature)

                if y_pool[doc_id] == 0:
                    rationales_c0.add(feature)
                else:
                    rationales_c1.add(feature)

            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(long(doc_id))

        if cvTrain:
            # get optimal parameters depending on the model_type
            # (re-tuned every 20 docs, i.e. when number_of_docs % 20 == 10;
            # X_train is rebuilt from scratch over the full training_set).

            X_train = None
            y_train = []
            sample_weight = []

            if model_type == 'mnb_LwoR':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o = optimalMNBLwoRParameters(
                        X_pool[training_set], y_pool[training_set],
                        all_features)

                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]
                    feature_counter = feature_counter + 1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))

                    y_train.append(y_pool[doc_id])

            elif model_type == 'mnb':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o = optimalMNBParameters(X_pool[training_set],
                                                    y_pool[training_set],
                                                    all_features)

                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]
                    feature_counter = feature_counter + 1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))

                    y_train.append(y_pool[doc_id])

            elif model_type == 'svm_linear_LwoR':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o, C = optimalSVMLwoRParameters(
                        X_pool[training_set], y_pool[training_set],
                        all_features, seed)

                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)

                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]

                    feature_counter = feature_counter + 1

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))

                    y_train.append(y_pool[doc_id])

            elif model_type == 'svm_linear':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o, C = optimalSVMParameters(X_pool[training_set],
                                                       y_pool[training_set],
                                                       all_features, seed)

                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)

                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]

                    feature_counter = feature_counter + 1

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))

                    y_train.append(y_pool[doc_id])

            if model_type == 'poolingMNB':
                classpriors = np.zeros(2)
                classpriors[1] = (np.sum(y_pool[docs]) * 1.) / (len(docs) * 1.)
                classpriors[0] = 1. - classpriors[1]

                if np.mod(number_of_docs, 20) == 10:
                    alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(
                        X_pool[training_set], y_pool[training_set],
                        all_features, smoothing, num_feat)

                # The feature model is rebuilt and refit from scratch with the
                # current rationale sets.
                feature_model = FeatureMNBUniform(rationales_c0, rationales_c1,
                                                  num_feat, smoothing,
                                                  classpriors, poolingFM_r)

                feature_counter = 0
                for doc_id in training_set:
                    if all_features[feature_counter]:
                        # updates feature model with features one at a time
                        feature_model.fit(all_features[feature_counter],
                                          y_pool[doc_id])
                    feature_counter = feature_counter + 1

                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))

                    y_train.append(y_pool[doc_id])

            if model_type == 'Zaidan':

                if np.mod(number_of_docs, 20) == 10:
                    zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(
                        X_pool[training_set], y_pool[training_set],
                        all_features, seed)

                feature_counter = 0

                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)

                    if all_features[feature_counter] is not None:
                        x_pseudo = (X_pool[doc_id]).todense()

                        # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                        x_feats = x[0].indices

                        for f in x_feats:
                            if f == all_features[feature_counter]:
                                test = x[0, f]
                                x_pseudo[0, f] = x[0, f] / zaidan_nu
                            else:
                                x_pseudo[0, f] = 0.0

                        x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)

                    if not y_train:
                        X_train = x
                        if all_features[feature_counter] is not None:
                            X_train = sp.vstack((X_train, x_pseudo))
                    else:
                        X_train = sp.vstack((X_train, x))
                        if all_features[feature_counter] is not None:
                            X_train = sp.vstack((X_train, x_pseudo))

                    y_train.append(y_pool[doc_id])
                    if all_features[feature_counter] is not None:
                        # append y label again for the pseudoinstance created
                        y_train.append(y_pool[doc_id])

                    sample_weight.append(zaidan_C)
                    if all_features[feature_counter] is not None:
                        # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                        sample_weight.append(zaidan_Ccontrast)

                    feature_counter = feature_counter + 1

        # Train the model
        # (fresh estimator every iteration, refit on the rebuilt X_train).

        if model_type == 'lrl2':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C,
                                       penalty='l2',
                                       random_state=random_state)
        elif model_type == 'lrl1':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C,
                                       penalty='l1',
                                       random_state=random_state)
        elif model_type == 'mnb':
            model = MultinomialNB(alpha=alpha)
        elif model_type == 'mnb_LwoR':
            model = MultinomialNB(alpha=alpha)
        elif model_type == 'svm_linear_LwoR':
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=C, random_state=random_state)
        elif model_type == 'svm_linear':
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=C, random_state=random_state)
        elif model_type == 'poolingMNB':
            instance_model = MultinomialNB(alpha=alpha)
            model = PoolingMNB()
        elif model_type == 'Zaidan':
            random_state = np.random.RandomState(seed=seed)
            model = svm.SVC(kernel='linear',
                            C=svm_C,
                            random_state=random_state)

        if model_type == 'poolingMNB':
            instance_model.fit(X_train, y_train)
            model.fit(instance_model, feature_model,
                      weights=poolingMNBWeights)  # train pooling_model
        elif model_type == 'Zaidan':
            model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
        else:
            model.fit(X_train, np.array(y_train))

        # Same score/hyperparameter bookkeeping as after the bootstrap fit.
        (accu, auc) = evaluate_model(model, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        if model_type == 'poolingMNB':
            model_scores['alpha'].append(alpha)
            model_scores['FMrvalue'].append(poolingFM_r)
            model_scores['IMweight'].append(poolingMNBWeights[0])
            model_scores['FMweight'].append(poolingMNBWeights[1])
        else:
            model_scores['FMrvalue'].append(0.0)
            model_scores['IMweight'].append(0.0)
            model_scores['FMweight'].append(0.0)

        if model_type == 'Zaidan':
            model_scores['zaidan_C'].append(zaidan_C)
            model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
            model_scores['zaidan_nu'].append(zaidan_nu)
        else:
            model_scores['zaidan_C'].append(0.0)
            model_scores['zaidan_Ccontrast'].append(0.0)
            model_scores['zaidan_nu'].append(0.0)

        if model_type == 'mnb' or model_type == 'mnb_LwoR':
            model_scores['alpha'].append(alpha)
        else:
            model_scores['alpha'].append(0.0)

        if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR':
            model_scores['svm_C'].append(C)
        else:
            model_scores['svm_C'].append(0.0)

        if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR' or model_type == 'mnb' or model_type == 'mnb_LwoR':
            model_scores['wr'].append(w_r)
            model_scores['wo'].append(w_o)
        else:
            model_scores['wr'].append(0.0)
            model_scores['wo'].append(0.0)

        num_training_samples.append(number_of_docs)

    print 'Active Learning took %2.2fs' % (time() - start)

    return (np.array(num_training_samples), model_scores)
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, poolingMNBWeights=None, Meville_etal_r=100.0, lr_C=1, svm_C=1, \
          Zaidan_etal_C=1, Zaidan_etal_Ccontrast=1, Zaidan_etal_mu=1, Debug=False):
    """Pool-based active learning with feature rationales.

    Bootstraps from ``training_set``, then repeatedly picks ``step_size``
    documents from ``pool_set`` according to ``selection_strategy``
    ('RND', 'UNC', 'UNC_PNC' or 'UNC_PC'), obtains one rationale feature
    per document from ``feature_expert``, retrains the classifier chosen
    by ``model_type`` ('lrl2', 'lrl1', 'mnb', 'svm_linear',
    'Melville_etal' or 'Zaidan_etal') and evaluates it on the test set,
    until ``X_train`` holds ``budget`` rows.

    ``w_r``/``w_o`` re-weight rationale vs. other features for the plain
    models; ``Zaidan_etal_*`` control the pseudoinstance SVM; and
    ``poolingMNBWeights`` mixes instance and feature MNBs for
    Melville et al.  (``Meville_etal_r`` keeps its historical misspelled
    name for caller compatibility.)

    Returns:
        tuple: (np.array of cumulative labeled-document counts after each
        round, dict mapping 'auc'/'accu' to per-round test scores).
    """
    # FIX: the default used to be the mutable list [0.5, 0.5]; a None
    # sentinel avoids sharing one list object across calls.
    if poolingMNBWeights is None:
        poolingMNBWeights = [0.5, 0.5]

    start = time()
    print('-' * 50)
    print('Starting Active Learning...')

    _, num_feat = X_pool.shape
    model_scores = {'auc': [], 'accu': []}

    rationales = set()
    rationales_c0 = set()  # rationale features seen for class 0
    rationales_c1 = set()  # rationale features seen for class 1

    feature_expert.rg.seed(seed)

    num_training_samples = []

    number_of_docs = 0

    docs = training_set

    X_train = None
    y_train = []

    # Per-row SVM weights; only consumed by the 'Zaidan_etal' model.
    sample_weight = list()

    if model_type == 'Melville_etal':
        # Create the feature model; class priors come from bootstrap labels.
        classpriors = np.zeros(2)
        classpriors[1] = (np.sum(y_pool[docs]) * 1.) / (len(docs) * 1.)
        classpriors[0] = 1. - classpriors[1]

        # NOTE(review): FeatureMNBUniform receives the (still empty) rationale
        # sets by reference, so later additions alias into the model --
        # confirm this aliasing is intended.
        feature_model = FeatureMNBUniform(rationales_c0, rationales_c1,
                                          num_feat, classpriors,
                                          Meville_etal_r)

    # ---- bootstrap phase: ingest the initial training documents ----
    for doc_id in docs:

        number_of_docs = number_of_docs + 1

        feature = feature_expert.any_informative_feature(
            X_pool[doc_id], y_pool[doc_id])

        if model_type == 'Melville_etal':
            # FIX: was `if feature:`, which silently dropped feature index 0
            # (falsy); compare with None as done everywhere else.
            if feature is not None:
                feature_model.fit(feature, y_pool[doc_id])

        rationales.add(feature)

        if y_pool[doc_id] == 0:
            rationales_c0.add(feature)
        else:
            rationales_c1.add(feature)

        if model_type == 'Zaidan_etal':
            x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
            if feature is not None:
                x_pseudo = (X_pool[doc_id]).todense()

                # create pseudoinstances based on rationales provided; one pseudoinstance is created per rationale.
                x_feats = x[0].indices

                for f in x_feats:
                    if f == feature:
                        # keep the rationale feature, scaled down by mu
                        x_pseudo[0, f] = x[0, f] / Zaidan_etal_mu
                    else:
                        x_pseudo[0, f] = 0.0
                x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)

        else:
            x = sp.csr_matrix(X_pool[doc_id], dtype=float)
            if "Melville_etal" not in model_type:
                # up-weight the rationale feature, down-weight the others
                x_feats = x[0].indices
                for f in x_feats:
                    if f == feature:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]

        if model_type == 'Zaidan_etal':
            if not y_train:
                X_train = x
                if feature is not None:
                    X_train = sp.vstack((X_train, x_pseudo))
            else:
                X_train = sp.vstack((X_train, x))
                if feature is not None:
                    X_train = sp.vstack((X_train, x_pseudo))

            y_train.append(y_pool[doc_id])
            if feature is not None:
                # append y label again for the pseudoinstance created
                y_train.append(y_pool[doc_id])

            sample_weight.append(Zaidan_etal_C)
            if feature is not None:
                # append instance weight=Zaidan_etal_Ccontrast for the pseudoinstance created
                sample_weight.append(Zaidan_etal_Ccontrast)

        else:
            if not y_train:
                X_train = x
            else:
                X_train = sp.vstack((X_train, x))

            y_train.append(y_pool[doc_id])

    # Train the model on the bootstrap data

    if model_type == 'lrl2':
        random_state = RandomState(seed=seed)
        model = LogisticRegression(C=lr_C,
                                   penalty='l2',
                                   random_state=random_state)
    elif model_type == 'lrl1':
        random_state = RandomState(seed=seed)
        model = LogisticRegression(C=lr_C,
                                   penalty='l1',
                                   random_state=random_state)
    elif model_type == 'mnb':
        model = MultinomialNB(alpha=alpha)
    elif model_type == 'svm_linear':
        random_state = RandomState(seed=seed)
        model = LinearSVC(C=svm_C, random_state=random_state)
    elif model_type == 'Melville_etal':
        instance_model = MultinomialNB(alpha=alpha)
        model = PoolingMNB()
    elif model_type == 'Zaidan_etal':
        random_state = RandomState(seed=seed)
        model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state)

    if model_type == 'Melville_etal':
        instance_model.fit(X_train, y_train)
        model.fit(instance_model, feature_model,
                  weights=poolingMNBWeights)  # train pooling_model
    elif model_type == 'Zaidan_etal':
        model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
    else:
        model.fit(X_train, np.array(y_train))

    (accu, auc) = evaluate_model(model, X_test, y_test)
    model_scores['auc'].append(auc)
    model_scores['accu'].append(accu)

    num_training_samples.append(number_of_docs)

    feature_expert.rg.seed(seed)

    if selection_strategy == 'RND':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'UNC':
        doc_pick_model = UNCSampling()
    elif selection_strategy == 'UNC_PNC':
        doc_pick_model = UNCPreferNoConflict()
    elif selection_strategy == 'UNC_PC':
        doc_pick_model = UNCPreferConflict()
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' %
                         selection_strategy)

    k = step_size

    # ---- active-learning loop: query, label, retrain, evaluate ----
    while X_train.shape[0] < budget:

        # Choose documents based on the strategy chosen.  The two
        # conflict-aware strategies take the rationale sets as extra input
        # (these branches were previously duplicated verbatim).
        if selection_strategy in ('UNC_PNC', 'UNC_PC'):
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)

        if doc_ids is None or len(doc_ids) == 0:
            break

        for doc_id in doc_ids:
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(doc_id)

            feature = feature_expert.any_informative_feature(
                X_pool[doc_id], y_pool[doc_id])

            if model_type == 'Melville_etal':
                # FIX: None-comparison instead of truthiness (feature 0).
                if feature is not None:
                    feature_model.fit(feature, y_pool[doc_id])

            number_of_docs = number_of_docs + 1

            rationales.add(feature)

            if y_pool[doc_id] == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)

            if model_type == 'Zaidan_etal':
                x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                if feature is not None:
                    x_pseudo = (X_pool[doc_id]).todense()

                    # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                    x_feats = x[0].indices

                    for f in x_feats:
                        if f == feature:
                            x_pseudo[0, f] = x[0, f] / Zaidan_etal_mu
                        else:
                            x_pseudo[0, f] = 0.0
                    x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)

            else:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                if "Melville_etal" not in model_type:
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == feature:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]

            if model_type == 'Zaidan_etal':
                X_train = sp.vstack((X_train, x))
                if feature is not None:
                    X_train = sp.vstack((X_train, x_pseudo))

                y_train.append(y_pool[doc_id])
                if feature is not None:
                    # append y label again for the pseudoinstance created
                    y_train.append(y_pool[doc_id])

                sample_weight.append(Zaidan_etal_C)
                if feature is not None:
                    # append instance weight=Zaidan_etal_Ccontrast for the pseudoinstance created
                    sample_weight.append(Zaidan_etal_Ccontrast)

            else:
                X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])

        # Retrain the model from scratch on the enlarged training set

        if model_type == 'lrl2':
            random_state = RandomState(seed=seed)
            model = LogisticRegression(C=lr_C,
                                       penalty='l2',
                                       random_state=random_state)
        elif model_type == 'lrl1':
            random_state = RandomState(seed=seed)
            model = LogisticRegression(C=lr_C,
                                       penalty='l1',
                                       random_state=random_state)
        elif model_type == 'mnb':
            model = MultinomialNB(alpha=alpha)
        elif model_type == 'svm_linear':
            random_state = RandomState(seed=seed)
            model = LinearSVC(C=svm_C, random_state=random_state)
        elif model_type == 'Melville_etal':
            instance_model = MultinomialNB(alpha=alpha)
            model = PoolingMNB()
        elif model_type == 'Zaidan_etal':
            random_state = RandomState(seed=seed)
            model = svm.SVC(kernel='linear',
                            C=svm_C,
                            random_state=random_state)

        if model_type == 'Melville_etal':
            instance_model.fit(X_train, y_train)
            model.fit(instance_model, feature_model,
                      weights=poolingMNBWeights)  # train pooling_model
        elif model_type == 'Zaidan_etal':
            model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
        else:
            model.fit(X_train, np.array(y_train))

        (accu, auc) = evaluate_model(model, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)

        num_training_samples.append(number_of_docs)

    print('Active Learning took %2.2fs' % (time() - start))

    return (np.array(num_training_samples), model_scores)
示例#6
0
 # Example script fragment (Python 2): interactively browse the expert's
 # top-ranked features per queried document, then train and score a uniform
 # feature MNB and an L1 logistic-regression baseline.
 # NOTE(review): doc_ids, feature_names, fe, X_pool, y_pool, X_pool_docs,
 # X_test, y_test and args are defined earlier in the original script.
 top_n = 20
 
 print '\n'
 print '=' * 50
 
 # Page through the documents one at a time; answering 'n' stops early.
 for doc in doc_ids:
     print_all_features(feature_names, fe, top_n, doc, X_pool, y_pool, X_pool_docs)
     
     print '=' * 50
     ch = raw_input('Display the next document? Press Enter to continue or type \'n\' to exit...  ')
     
     if ch == 'n':
         break
 
 t0 = time()
 print '-' * 50
 print 'Starting to train feature_model(MNB)...'
 feature_model = FeatureMNBUniform([], [], num_feat=X_pool.shape[1], smoothing=1e-6, class_prior = [0.5, 0.5], r=100.)
 
 # Feed each pool document's single most informative feature to the model.
 # NOTE(review): `if feature:` skips feature index 0 (falsy) -- probably
 # intended to be `feature is not None`; behavior left unchanged here.
 for doc in range(X_pool.shape[0]):
     feature = fe.most_informative_feature(X_pool[doc], y_pool[doc])
     if feature:
         feature_model.fit(feature, y_pool[doc]) # train feature_model one by one
 
 print 'Feature Model(MNB): accu = %f, auc = %f' % evaluate_model(feature_model, X_test, y_test)
 print 'Training feature_model(MNB) took %5.2f' % (time() - t0)
 
 # L1-regularized logistic regression baseline on the same split.
 logit = linear_model.LogisticRegression(C=args.c, penalty='l1')
 logit.fit(X_pool, y_pool)
 
 print 'Feature Model(LogisticRegression): accu = %f, auc = %f' % evaluate_model(logit, X_test, y_test)
    def choice(self, X, y, pool, train_indices, current_feature_model,
               current_reasoning_model, w_n, w_a):
        """Return the pool document whose addition maximizes test-set AUC.

        A random sub-pool of candidates is scored one by one: for each
        candidate we retrain an instance model on the hypothetical training
        set, clone and refit the feature model, partial-fit a deep copy of
        the reasoning model, pool the first two, and evaluate whichever
        model ``self.optimize`` ('P'/'I'/'F'/'R') selects on the held-out
        test data.  The candidate with the highest AUC wins.
        """
        shuffled = self.rgen.permutation(len(pool))
        sampled = [pool[idx] for idx in shuffled[:self.sub_pool]]

        def _clone_feature_model():
            # Re-create the caller's feature model so the trial fit below
            # cannot disturb the original.
            if isinstance(current_feature_model, FeatureMNBUniform):
                return FeatureMNBUniform(
                    current_feature_model.class0_feats,
                    current_feature_model.class1_feats,
                    self.feature_expert.num_features, 0)
            if isinstance(current_feature_model, FeatureMNBWeighted):
                return FeatureMNBWeighted(
                    num_feat=self.feature_expert.num_features,
                    feat_count=current_feature_model.feature_count_,
                    imaginary_counts=current_feature_model.imaginary_counts)
            raise ValueError('Feature model type: \'%s\' unknown!' %
                             current_feature_model.__class__.__name__)

        auc_scores = []

        for candidate in sampled:
            trial_indices = list(train_indices) + [candidate]

            # instance model trained on the hypothetical training set
            instance_model = MultinomialNB(alpha=1.)
            instance_model.fit(X[trial_indices], y[trial_indices])

            feature_model = _clone_feature_model()

            top_feat = self.feature_expert.most_informative_feature(
                X[candidate], y[candidate])

            if top_feat:
                # fit also calls update; so there is no need to update again
                feature_model.fit(top_feat, y[candidate])
            else:
                feature_model.update()

            # partial-train a deep copy of the reasoning model
            reasoning_model = copy.deepcopy(current_reasoning_model)
            reasoning_model.partial_fit(X[candidate], y[candidate], top_feat,
                                        w_n, w_a)

            # pooling model mixing instance and feature models equally
            pooling_model = PoolingMNB()
            pooling_model.fit(instance_model,
                              feature_model,
                              weights=[0.5, 0.5])

            # score whichever model we are optimizing for
            by_key = {'P': pooling_model,
                      'I': instance_model,
                      'F': feature_model,
                      'R': reasoning_model}
            if self.optimize not in by_key:
                raise ValueError('Optimization Model: \'%s\' invalid!' %
                                 self.optimize)
            opt_model = by_key[self.optimize]

            y_probas = opt_model.predict_proba(self.X_test)
            auc_scores.append(
                metrics.roc_auc_score(self.y_test, y_probas[:, 1]))

        # highest-AUC candidate (same tie behavior as np.argsort(...)[-1])
        return sampled[np.argsort(auc_scores)[-1]]
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, poolingMNBWeights=[0.5, 0.5], Meville_etal_r=100.0, lr_C=1, svm_C=1, \
          Zaidan_etal_C=1, Zaidan_etal_Ccontrast=1, Zaidan_etal_mu=1, Debug=False):
    
    start = time()
    print '-' * 50
    print 'Starting Active Learning...'
    
    _, num_feat = X_pool.shape
    model_scores = {'auc':[], 'accu':[]}
    
    rationales  = set()
    rationales_c0  = set()
    rationales_c1  = set()

    feature_expert.rg.seed(seed)
    
    num_training_samples = []
    
    number_of_docs = 0
    
    docs = training_set
    
    X_train = None
    y_train = []
    
    if model_type=='Melville_etal':      
        # create feature model  
        classpriors=np.zeros(2)            
        classpriors[1] = (np.sum(y_pool[docs])*1.)/(len(docs)*1.)
        classpriors[0] = 1. - classpriors[1] 

        feature_model = FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, classpriors, Meville_etal_r)    

    for doc_id in docs:
        
        number_of_docs=number_of_docs+1    
        
        feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])

        if model_type == 'Melville_etal':        
            if feature:
                feature_model.fit(feature, y_pool[doc_id])
        
        rationales.add(feature)

        if y_pool[doc_id] == 0:
            rationales_c0.add(feature)
        else:
            rationales_c1.add(feature)        
                    

        if model_type == 'Zaidan_etal':
            x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
            if feature is not None:
                x_pseudo = (X_pool[doc_id]).todense()
                                
                # create pseudoinstances based on rationales provided; one pseudoinstance is created per rationale.
                x_feats = x[0].indices
        
                for f in x_feats:
                    if f == feature:
                        test= x[0,f]
                        x_pseudo[0,f] = x[0,f]/Zaidan_etal_mu
                    else:                                              
                        x_pseudo[0,f] = 0.0                          
                x_pseudo=sp.csr_matrix(x_pseudo, dtype=np.float64)

        else:
            x = sp.csr_matrix(X_pool[doc_id], dtype=float)
            if "Melville_etal" not in model_type:         
                x_feats = x[0].indices
                for f in x_feats:
                    if f == feature:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
        

        if model_type=='Zaidan_etal':
            if not y_train:
                X_train = x      
                if feature is not None:      
                    X_train = sp.vstack((X_train, x_pseudo))
            else:
                X_train = sp.vstack((X_train, x))
                if feature is not None:
                    X_train = sp.vstack((X_train, x_pseudo))
        
            y_train.append(y_pool[doc_id])
            if feature is not None:
                # append y label again for the pseudoinstance created
                y_train.append(y_pool[doc_id])
        

            sample_weight.append(Zaidan_etal_C)
            if feature is not None:
                # append instance weight=Zaidan_etal_Ccontrast for the pseudoinstance created
                sample_weight.append(Zaidan_etal_Ccontrast)  

        else:
            if not y_train:
                X_train = x
            else:
                X_train = sp.vstack((X_train, x))
        
            y_train.append(y_pool[doc_id])
    
    # Train the model
    
    if model_type=='lrl2':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
    elif model_type=='lrl1':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)        
    elif model_type=='mnb':        
        model = MultinomialNB(alpha=alpha)        
    elif model_type=='svm_linear':
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=svm_C, random_state=random_state)
    elif model_type=='Melville_etal':
        instance_model=MultinomialNB(alpha=alpha)        
        model = PoolingMNB()
    elif model_type=='Zaidan_etal':
        random_state = np.random.RandomState(seed=seed)        
        model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state)
        
    if model_type=='Melville_etal':                
        #feature_model.fit(feature, y_pool[doc_id])
        instance_model.fit(X_train, y_train)
        model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model
    elif model_type=='Zaidan_etal':
        model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
    else:
        model.fit(X_train, np.array(y_train))
    
    
            
    (accu, auc) = evaluate_model(model, X_test, y_test)
    model_scores['auc'].append(auc)
    model_scores['accu'].append(accu)
    
    num_training_samples.append(number_of_docs)
    
    feature_expert.rg.seed(seed)        
    
    if selection_strategy == 'RND':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'UNC':
        doc_pick_model = UNCSampling()         
    elif selection_strategy == 'UNC_PNC':
        doc_pick_model = UNCPreferNoConflict()   
    elif selection_strategy == 'UNC_PC':
        doc_pick_model = UNCPreferConflict()    
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy)
 
  
    k = step_size  

    while X_train.shape[0] < budget:                

        # Choose a document based on the strategy chosen
        if selection_strategy == 'UNC_PNC':
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)       
        elif selection_strategy == 'UNC_PC':
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)        
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)
        
        if doc_ids is None or len(doc_ids) == 0:
            break        
        
        for doc_id in doc_ids:
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(doc_id)

            #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
            feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])

            if model_type=='Melville_etal':        
                if feature:
                    feature_model.fit(feature, y_pool[doc_id])
            
            number_of_docs=number_of_docs+1    

            rationales.add(feature)

            if y_pool[doc_id] == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)
            

            if model_type=='Zaidan_etal':
                x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                if feature is not None:
                    x_pseudo = (X_pool[doc_id]).todense()
                                
                    # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                    x_feats = x[0].indices
        
                    for f in x_feats:
                        if f == feature:
                            test= x[0,f]
                            x_pseudo[0,f] = x[0,f]/Zaidan_etal_mu
                        else:                                              
                            x_pseudo[0,f] = 0.0                          
                    x_pseudo=sp.csr_matrix(x_pseudo, dtype=np.float64)

            else:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                if "Melville_etal" not in model_type:         
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == feature:
                            x[0,f] = w_r*x[0,f]
                        else:
                            x[0,f] = w_o*x[0,f]                                   

            if model_type=='Zaidan_etal':
                X_train = sp.vstack((X_train, x))
                if feature is not None:
                    X_train = sp.vstack((X_train, x_pseudo))
        
                y_train.append(y_pool[doc_id])
                if feature is not None:
                    # append y label again for the pseudoinstance created
                    y_train.append(y_pool[doc_id])        

                sample_weight.append(Zaidan_etal_C)
                if feature is not None:
                    # append instance weight=Zaidan_etal_Ccontrast for the pseudoinstance created
                    sample_weight.append(Zaidan_etal_Ccontrast)  

            else:
                X_train = sp.vstack((X_train, x))        
                y_train.append(y_pool[doc_id])
        
        # Train the model

        
        if model_type=='lrl2':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
        elif model_type=='lrl1':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)        
        elif model_type=='mnb':        
            model = MultinomialNB(alpha=alpha)        
        elif model_type=='svm_linear':
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=svm_C, random_state=random_state)
        elif model_type=='Melville_etal':
            instance_model=MultinomialNB(alpha=alpha)        
            model = PoolingMNB()
        elif model_type=='Zaidan_etal':
            random_state = np.random.RandomState(seed=seed)        
            model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state)                                                          

        if model_type=='Melville_etal':                            
            instance_model.fit(X_train, y_train)
            model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model
        elif model_type=='Zaidan_etal':
            model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
        else:
            model.fit(X_train, np.array(y_train))
            
        (accu, auc) = evaluate_model(model, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        
        num_training_samples.append(number_of_docs)
        
  
    print 'Active Learning took %2.2fs' % (time() - start)
    
    return (np.array(num_training_samples), model_scores)
示例#9
0
                           X_pool_docs)

        print '=' * 50
        ch = raw_input(
            'Display the next document? Press Enter to continue or type \'n\' to exit...  '
        )

        if ch == 'n':
            break

    t0 = time()
    print '-' * 50
    print 'Starting to train feature_model(MNB)...'
    feature_model = FeatureMNBUniform([], [],
                                      num_feat=X_pool.shape[1],
                                      smoothing=1e-6,
                                      class_prior=[0.5, 0.5],
                                      r=100.)

    for doc in range(X_pool.shape[0]):
        feature = fe.most_informative_feature(X_pool[doc], y_pool[doc])
        if feature:
            feature_model.fit(feature,
                              y_pool[doc])  # train feature_model one by one

    print 'Feature Model(MNB): accu = %f, auc = %f' % evaluate_model(
        feature_model, X_test, y_test)
    print 'Training feature_model(MNB) took %5.2f' % (time() - t0)

    logit = linear_model.LogisticRegression(C=args.c, penalty='l1')
    logit.fit(X_pool, y_pool)
 # Example script fragment (Python 2): compare supervised baselines against
 # feature-expert models on one dataset split.
 # NOTE(review): args, load_dataset, evaluate_model, feature_expert and
 # FeatureMNBUniform come from earlier parts of the original script.
 (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(args.dataset)
 
 # Supervised baselines trained on the full pool.
 models = {'MultinomialNB(alpha=1)':MultinomialNB(alpha=1), \
           'LogisticRegression(C=1, penalty=\'l1\')':LogisticRegression(C=1, penalty='l1'), \
           'LogisticRegression(C=0.1, penalty=\'l1\')':LogisticRegression(C=0.1, penalty='l1')}
 
 aucs = {}
 
 for mk in models.keys():
     models[mk].fit(X_pool, y_pool)
     _, auc = evaluate_model(models[mk], X_test, y_test)
     aucs[mk] = auc
 
 # Feature expert ranks features by the L1 metric.
 fe = feature_expert(X_pool, y_pool, metric="L1", C=args.c)
 
 # Feature model built from ALL expert-ranked features.
 all_feature_model = FeatureMNBUniform(fe.feature_rank[0], fe.feature_rank[1], fe.num_features, smoothing=args.smoothing)
 all_feature_model.update()
 
 _, all_auc = evaluate_model(all_feature_model, X_test, y_test)
 
 
 # Feature model built from only the top-k features per class.
 k_feature_model = FeatureMNBUniform(fe.feature_rank[0][:args.k], fe.feature_rank[1][:args.k], fe.num_features, smoothing=args.smoothing)
 k_feature_model.update()
 
 _, k_auc = evaluate_model(k_feature_model, X_test, y_test)
 
 
 
 
 print '-' * 50
 
示例#11
0
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, smoothing=0, poolingMNBWeights=[0.5, 0.5], poolingFM_r=100.0, lr_C=1, svm_C=1, \
          zaidan_C=1, zaidan_Ccontrast=1, zaidan_nu=1, cvTrain=False, Debug=False):
    """Run one active-learning session and return per-iteration evaluation scores.

    Starting from a bootstrap ``training_set``, repeatedly (a) asks the
    ``feature_expert`` for a rationale feature for each labeled document,
    (b) builds a rationale-reweighted training matrix for the chosen
    ``model_type``, (c) trains and evaluates the model on the test set, and
    (d) uses ``selection_strategy`` to pull ``step_size`` more documents from
    ``pool_set`` until ``budget`` documents have been labeled.

    Parameters (as demonstrated by the code below):
        model_type: one of 'lrl1', 'lrl2', 'mnb', 'mnb_LwoR', 'svm_linear',
            'svm_linear_LwoR', 'poolingMNB', 'Zaidan'.
        training_set / pool_set: lists of document indices; both are mutated
            in place as documents are queried.
        w_r, w_o: multipliers applied to rationale / other feature counts.
        cvTrain: when True, (re-)tunes hyperparameters by cross-validation
            before the initial fit and periodically during the loop.

    Returns:
        (np.array(num_training_samples), model_scores) -- the number of
        labeled documents at each evaluation point and a dict of per-iteration
        metric/hyperparameter lists.

    NOTE(review): mutable default argument ``poolingMNBWeights=[0.5, 0.5]`` is
    shared across calls -- confirm no caller mutates it.
    NOTE(review): when ``cvTrain`` is False the initial ``X_train`` is never
    built (stays None) and the first ``model.fit`` below would fail --
    presumably callers always pass cvTrain=True; verify.
    """
    
    start = time()
    print '-' * 50
    print 'Starting Active Learning...'
    
    _, num_feat = X_pool.shape
    model_scores = {'auc':[], 'accu':[], 'wr':[], 'wo':[], 'alpha':[], 'svm_C':[], 'zaidan_C':[], 'zaidan_Ccontrast':[], 'zaidan_nu':[], 'FMrvalue':[], 'IMweight':[], 'FMweight':[]}
    
    rationales  = set()
    rationales_c0  = set()
    rationales_c1  = set()

    number_of_docs = 0    

    # Seed the feature expert's RNG so rationale choices are reproducible.
    feature_expert.rg.seed(seed)
    
    num_training_samples = []
    
    # all_features[i] is the rationale feature (or None) for the i-th labeled
    # document, in labeling order; it is indexed by feature_counter below.
    all_features=[]
    
    # keep all the training data instance ids in docs list    
    
    docs = training_set          
    
    X_train = None
    y_train = []
    sample_weight = []
      

    # Collect one rationale feature per bootstrap document.
    for doc_id in docs:
        #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
        feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])         
        
        number_of_docs=number_of_docs+1       
        
        # append feature to all_features, even if it is None
        all_features.append(feature)                     

        if feature is not None:
            rationales.add(feature)

            if y_pool[doc_id] == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)

    if cvTrain:
        # get optimal parameters depending on the model_type
        # Each branch below tunes hyperparameters on the current training set
        # and then rebuilds X_train row by row, scaling the rationale feature
        # by w_r and all other features by w_o (LwoR variants tune w_r == w_o).

        if model_type=='mnb_LwoR':
            w_r, w_o=optimalMNBLwoRParameters(X_pool[training_set], y_pool[training_set], all_features)

            feature_counter=0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter=feature_counter+1
                if not y_train:
                    # first row: start the sparse training matrix
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
        
                y_train.append(y_pool[doc_id])

                
        elif model_type=='mnb':      
            w_r, w_o=optimalMNBParameters(X_pool[training_set], y_pool[training_set], all_features)

            feature_counter=0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter=feature_counter+1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
        
                y_train.append(y_pool[doc_id])

        
        elif model_type=='svm_linear':                  
            w_r, w_o, C= optimalSVMParameters(X_pool[training_set], y_pool[training_set], all_features, seed)            

            feature_counter=0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter=feature_counter+1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
        
                y_train.append(y_pool[doc_id])       
                
        elif model_type=='svm_linear_LwoR':                  

            w_r, w_o, C= optimalSVMLwoRParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
            feature_counter=0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter=feature_counter+1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
        
                y_train.append(y_pool[doc_id])                                    
            

        if model_type=='poolingMNB':   
            # Class priors estimated from the labeled documents' label mean.
            classpriors=np.zeros(2)            
            classpriors[1]=(np.sum(y_pool[docs])*1.)/(len(docs)*1.)
            classpriors[0]= 1. - classpriors[1]     
            
            alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(X_pool[training_set], y_pool[training_set], all_features, smoothing, num_feat)
            
            feature_model=FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, smoothing, classpriors, poolingFM_r)

            feature_counter=0
            for doc_id in docs:
                if all_features[feature_counter]:
                    # updates feature model with features one at a time
                    feature_model.fit(all_features[feature_counter], y_pool[doc_id])
                feature_counter=feature_counter+1

                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
        
                y_train.append(y_pool[doc_id])

        if model_type=='Zaidan':

            zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(X_pool[training_set], y_pool[training_set], all_features, seed)

            feature_counter=0

            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                if all_features[feature_counter] is not None:
                    x_pseudo = (X_pool[doc_id]).todense()
                                
                    # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                    x_feats = x[0].indices
        
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            test= x[0,f]
                            # pseudoinstance keeps only the rationale feature,
                            # scaled by 1/nu (Zaidan-style contrast example)
                            x_pseudo[0,f] = x[0,f]/zaidan_nu
                        else:                                              
                            x_pseudo[0,f] = 0.0                          
                    x_pseudo=sp.csr_matrix(x_pseudo, dtype=np.float64)                
        
                if not y_train:
                    X_train = x      
                    if all_features[feature_counter] is not None:      
                        X_train = sp.vstack((X_train, x_pseudo))
                else:
                    X_train = sp.vstack((X_train, x))
                    if all_features[feature_counter] is not None:
                        X_train = sp.vstack((X_train, x_pseudo))
        
                y_train.append(y_pool[doc_id])
                if all_features[feature_counter] is not None:
                    # append y label again for the pseudoinstance created
                    y_train.append(y_pool[doc_id])
        

                sample_weight.append(zaidan_C)
                if all_features[feature_counter] is not None:
                    # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                    sample_weight.append(zaidan_Ccontrast)  

                feature_counter = feature_counter+1
    
    # Train the model
    
    if model_type=='lrl2':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
    elif model_type=='lrl1':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)        
    elif model_type=='mnb':        
        model = MultinomialNB(alpha=alpha)        
    elif model_type=='mnb_LwoR':        
        model = MultinomialNB(alpha=alpha)  
    elif model_type=='svm_linear_LwoR':        
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type=='svm_linear':
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type=='poolingMNB':
        instance_model=MultinomialNB(alpha=alpha)        
        model = PoolingMNB()
    elif model_type=='Zaidan':
        random_state = np.random.RandomState(seed=seed)        
        # NOTE(review): C is hard-coded to 1.0 here, but the in-loop retrain
        # below uses C=svm_C -- per-instance costs come via sample_weight, so
        # this may be intentional, but confirm the inconsistency.
        model = svm.SVC(kernel='linear', C=1.0, random_state=random_state)
        
    if model_type=='poolingMNB':                        
        instance_model.fit(X_train, y_train)
        model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model
    elif model_type=='Zaidan':
        model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
    else:
        model.fit(X_train, np.array(y_train))
    
    
            
    # Record initial (bootstrap-only) scores; hyperparameter slots that do not
    # apply to this model_type are filled with 0.0 to keep lists aligned.
    (accu, auc) = evaluate_model(model, X_test, y_test)
    model_scores['auc'].append(auc)
    model_scores['accu'].append(accu)
    if model_type=='poolingMNB':
        model_scores['alpha'].append(alpha)
        model_scores['FMrvalue'].append(poolingFM_r)
        model_scores['IMweight'].append(poolingMNBWeights[0])
        model_scores['FMweight'].append(poolingMNBWeights[1])
    else:
        model_scores['FMrvalue'].append(0.0)
        model_scores['IMweight'].append(0.0)
        model_scores['FMweight'].append(0.0)

    if model_type=='Zaidan':
        model_scores['zaidan_C'].append(zaidan_C)
        model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
        model_scores['zaidan_nu'].append(zaidan_nu)
    else:
        model_scores['zaidan_C'].append(0.0)
        model_scores['zaidan_Ccontrast'].append(0.0)
        model_scores['zaidan_nu'].append(0.0)

    if model_type=='mnb' or model_type=='mnb_LwoR':
        model_scores['alpha'].append(alpha)        
    else:
        model_scores['alpha'].append(0.0)        

    if model_type=='svm_linear' or model_type=='svm_linear_LwoR':
        model_scores['svm_C'].append(C)        
    else:
        model_scores['svm_C'].append(0.0)
        
    if model_type=='svm_linear' or model_type=='svm_linear_LwoR' or model_type=='mnb' or model_type=='mnb_LwoR':
        model_scores['wr'].append(w_r)
        model_scores['wo'].append(w_o)
    else:
        model_scores['wr'].append(0.0)
        model_scores['wo'].append(0.0)
    
    num_training_samples.append(number_of_docs)
    
    # Re-seed so document selection below is reproducible independently of the
    # rationale queries made during bootstrapping.
    feature_expert.rg.seed(seed)        
    
    if selection_strategy == 'random':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'unc':
        doc_pick_model = UNCSampling()         
    elif selection_strategy == "pnc":
        doc_pick_model = UNCPreferNoConflict()
    elif selection_strategy == "pnr":
        doc_pick_model = UNCPreferNoRationale()
    elif selection_strategy == "pr":
        doc_pick_model = UNCPreferRationale()
    elif selection_strategy == "pc":
        doc_pick_model = UNCPreferConflict()
    elif selection_strategy == "tt":
        doc_pick_model = UNCThreeTypes()
    elif selection_strategy == "pipe":
        doc_pick_model = Pipe([UNCSampling(), UNCPreferConflict()], [10, 30])
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy)
 
  
    k = step_size  


    #while X_train.shape[0] < budget:     
    # Main active-learning loop: query k documents per iteration until the
    # labeling budget is exhausted.
    while number_of_docs < budget:                       

        # Choose a document based on the strategy chosen
        if selection_strategy == "pnc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pnr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "tt":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pipe":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)
        
        if doc_ids is None or len(doc_ids) == 0:
            # Strategy could not produce candidates (e.g. pool exhausted).
            break        

        

        for doc_id in doc_ids:            
            #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
            feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])

            all_features.append(feature)                
            
            number_of_docs=number_of_docs + 1
        
            if feature is not None:
                rationales.add(feature)

                if y_pool[doc_id] == 0:
                    rationales_c0.add(feature)
                else:
                    rationales_c1.add(feature)

            # Remove the chosen document from pool and add it to the training set            
            pool_set.remove(doc_id)                        
            training_set.append(long(doc_id))               


        if cvTrain:
        # get optimal parameters depending on the model_type
            # Rebuild X_train from scratch each iteration; hyperparameters are
            # only re-tuned every 20 labeled documents (when count % 20 == 10).

            X_train = None
            y_train = []
            sample_weight = []

            if model_type=='mnb_LwoR':
                if np.mod(number_of_docs,20)==10:
                    w_r, w_o=optimalMNBLwoRParameters(X_pool[training_set], y_pool[training_set], all_features)

                feature_counter=0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0,f] = w_r*x[0,f]
                        else:
                            x[0,f] = w_o*x[0,f]
                    feature_counter=feature_counter+1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
        
                    y_train.append(y_pool[doc_id])

                
            elif model_type=='mnb':      
                if np.mod(number_of_docs,20)==10:
                    w_r, w_o=optimalMNBParameters(X_pool[training_set], y_pool[training_set], all_features)

                feature_counter=0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0,f] = w_r*x[0,f]
                        else:
                            x[0,f] = w_o*x[0,f]
                    feature_counter=feature_counter+1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
        
                    y_train.append(y_pool[doc_id])

        
            elif model_type=='svm_linear_LwoR': 
                if np.mod(number_of_docs,20)==10:                 
                    w_r, w_o, C= optimalSVMLwoRParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
                
                feature_counter=0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                            
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0,f] = w_r*x[0,f]
                        else:
                            x[0,f] = w_o*x[0,f]

                    feature_counter=feature_counter+1

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
        
                    y_train.append(y_pool[doc_id])       
                
            elif model_type=='svm_linear':                  
                if np.mod(number_of_docs,20)==10:
                    w_r, w_o, C= optimalSVMParameters(X_pool[training_set], y_pool[training_set], all_features, seed)

                feature_counter=0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                            
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0,f] = w_r*x[0,f]
                        else:
                            x[0,f] = w_o*x[0,f]

                    feature_counter=feature_counter+1

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
        
                    y_train.append(y_pool[doc_id])                                    
            

            if model_type=='poolingMNB':   
                # NOTE(review): class priors here use ``docs`` (the original
                # bootstrap alias of training_set) -- since docs is training_set
                # this tracks the grown set, but confirm that aliasing is
                # intended rather than a stale-variable bug.
                classpriors=np.zeros(2)            
                classpriors[1]=(np.sum(y_pool[docs])*1.)/(len(docs)*1.)
                classpriors[0]= 1. - classpriors[1]     
                
                if np.mod(number_of_docs,20)==10:
                    alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(X_pool[training_set], y_pool[training_set], all_features, smoothing, num_feat)
            
                feature_model=FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, smoothing, classpriors, poolingFM_r)

                feature_counter=0
                for doc_id in training_set:
                    if all_features[feature_counter]:
                        # updates feature model with features one at a time
                        feature_model.fit(all_features[feature_counter], y_pool[doc_id])
                    feature_counter=feature_counter+1

                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
        
                    y_train.append(y_pool[doc_id])

            if model_type=='Zaidan':

                if np.mod(number_of_docs,20)==10:
                    zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
                
                feature_counter=0

                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)

                    if all_features[feature_counter] is not None:
                        x_pseudo = (X_pool[doc_id]).todense()
                                
                        # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                        x_feats = x[0].indices
        
                        for f in x_feats:
                            if f == all_features[feature_counter]:
                                test= x[0,f]
                                x_pseudo[0,f] = x[0,f]/zaidan_nu
                            else:                                              
                                x_pseudo[0,f] = 0.0                
                                          
                        x_pseudo=sp.csr_matrix(x_pseudo, dtype=np.float64)                
        
                    if not y_train:
                        X_train = x      
                        if all_features[feature_counter] is not None:      
                            X_train = sp.vstack((X_train, x_pseudo))
                    else:
                        X_train = sp.vstack((X_train, x))
                        if all_features[feature_counter] is not None:
                            X_train = sp.vstack((X_train, x_pseudo))
        
                    y_train.append(y_pool[doc_id])
                    if all_features[feature_counter] is not None:
                        # append y label again for the pseudoinstance created
                        y_train.append(y_pool[doc_id])
        

                    sample_weight.append(zaidan_C)
                    if all_features[feature_counter] is not None:
                        # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                        sample_weight.append(zaidan_Ccontrast)  

                    feature_counter = feature_counter+1
        
        # Train the model

        if model_type=='lrl2':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
        elif model_type=='lrl1':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)        
        elif model_type=='mnb':        
            model = MultinomialNB(alpha=alpha)        
        elif model_type=='mnb_LwoR':        
            model = MultinomialNB(alpha=alpha)  
        elif model_type=='svm_linear_LwoR':        
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=C, random_state=random_state)
        elif model_type=='svm_linear':
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=C, random_state=random_state)
        elif model_type=='poolingMNB':
            instance_model=MultinomialNB(alpha=alpha)        
            model = PoolingMNB()
        elif model_type=='Zaidan':
            random_state = np.random.RandomState(seed=seed)        
            # NOTE(review): uses C=svm_C here, unlike the initial fit above
            # which hard-codes C=1.0 -- confirm which is intended.
            model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state)
        
        if model_type=='poolingMNB':                        
            instance_model.fit(X_train, y_train)
            model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model
        elif model_type=='Zaidan':
            model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
        else:
            model.fit(X_train, np.array(y_train))
        
                    
        # Record this iteration's scores (same bookkeeping as the initial fit).
        (accu, auc) = evaluate_model(model, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        if model_type=='poolingMNB':
            model_scores['alpha'].append(alpha)
            model_scores['FMrvalue'].append(poolingFM_r)
            model_scores['IMweight'].append(poolingMNBWeights[0])
            model_scores['FMweight'].append(poolingMNBWeights[1])
        else:
            model_scores['FMrvalue'].append(0.0)
            model_scores['IMweight'].append(0.0)
            model_scores['FMweight'].append(0.0)

        if model_type=='Zaidan':
            model_scores['zaidan_C'].append(zaidan_C)
            model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
            model_scores['zaidan_nu'].append(zaidan_nu)
        else:
            model_scores['zaidan_C'].append(0.0)
            model_scores['zaidan_Ccontrast'].append(0.0)
            model_scores['zaidan_nu'].append(0.0)

        if model_type=='mnb' or model_type=='mnb_LwoR':
            model_scores['alpha'].append(alpha)        
        else:
            model_scores['alpha'].append(0.0)        

        if model_type=='svm_linear' or model_type=='svm_linear_LwoR':
            model_scores['svm_C'].append(C)        
        else:
            model_scores['svm_C'].append(0.0)
        
        if model_type=='svm_linear' or model_type=='svm_linear_LwoR' or model_type=='mnb' or model_type=='mnb_LwoR':
            model_scores['wr'].append(w_r)
            model_scores['wo'].append(w_o)
        else:
            model_scores['wr'].append(0.0)
            model_scores['wo'].append(0.0)
        
        num_training_samples.append(number_of_docs)
        
  
    print 'Active Learning took %2.2fs' % (time() - start)
    
    return (np.array(num_training_samples), model_scores)
示例#12
0
    # Baseline comparison: fit three fully-supervised instance models on the
    # whole pool and record their test-set AUCs.
    models = {'MultinomialNB(alpha=1)':MultinomialNB(alpha=1), \
              'LogisticRegression(C=1, penalty=\'l1\')':LogisticRegression(C=1, penalty='l1'), \
              'LogisticRegression(C=0.1, penalty=\'l1\')':LogisticRegression(C=0.1, penalty='l1')}

    aucs = {}

    for mk in models.keys():
        models[mk].fit(X_pool, y_pool)
        _, auc = evaluate_model(models[mk], X_test, y_test)
        aucs[mk] = auc

    # Feature expert trained with L1-regularized logistic regression scores.
    fe = feature_expert(X_pool, y_pool, metric="L1", C=args.c)

    # Feature model built from ALL ranked features of each class.
    all_feature_model = FeatureMNBUniform(fe.feature_rank[0],
                                          fe.feature_rank[1],
                                          fe.num_features,
                                          smoothing=args.smoothing)
    all_feature_model.update()

    _, all_auc = evaluate_model(all_feature_model, X_test, y_test)

    # Feature model restricted to the top-k ranked features per class.
    k_feature_model = FeatureMNBUniform(fe.feature_rank[0][:args.k],
                                        fe.feature_rank[1][:args.k],
                                        fe.num_features,
                                        smoothing=args.smoothing)
    k_feature_model.update()

    _, k_auc = evaluate_model(k_feature_model, X_test, y_test)

    print '-' * 50
 def choice(self, X, y, pool, train_indices, current_feature_model, current_reasoning_model, w_n, w_a):
     """Greedily pick the pool document whose addition maximizes test AUC.

     Samples up to ``self.sub_pool`` candidate documents at random, and for
     each one simulates adding it to the training set: retrains an instance
     model, clones and refits a feature model, partial-fits a deep copy of
     the reasoning model, and combines instance + feature models into a
     pooling model.  The model selected by ``self.optimize`` ('P' pooling,
     'I' instance, 'F' feature, 'R' reasoning) is scored on
     ``self.X_test``/``self.y_test`` and the highest-AUC candidate's doc id
     is returned.

     NOTE(review): this peeks at the true test labels to pick documents --
     presumably intentional for an oracle/upper-bound experiment; confirm.

     Raises:
         ValueError: if the feature model class or ``self.optimize`` value
             is unrecognized.
     """
     
     rand_indices = self.rgen.permutation(len(pool))
     candidates = [pool[i] for i in rand_indices[:self.sub_pool]]
     
     aucs = []
     
     for doc in candidates:
         new_train_indices = list(train_indices)
         new_train_indices.append(doc)
         
         # train an instance model
         instance_model = MultinomialNB(alpha=1.)
         instance_model.fit(X[new_train_indices], y[new_train_indices])
                  
         # train a feature model
         
         # Rebuild (rather than mutate) the feature model so the caller's
         # current_feature_model is left untouched.
         feature_model = None
         if isinstance(current_feature_model, FeatureMNBUniform):
             feature_model = FeatureMNBUniform(current_feature_model.class0_feats, current_feature_model.class1_feats, self.feature_expert.num_features, 0)
         elif isinstance(current_feature_model, FeatureMNBWeighted):
             feature_model = FeatureMNBWeighted(num_feat = self.feature_expert.num_features, feat_count = current_feature_model.feature_count_, imaginary_counts = current_feature_model.imaginary_counts)
         else:
             raise ValueError('Feature model type: \'%s\' unknown!' % current_feature_model.__class__.__name__)
         
         top_feat = self.feature_expert.most_informative_feature(X[doc], y[doc])
         
         if top_feat:
             feature_model.fit(top_feat, y[doc]) # fit also calls update; so there is no need to update again
         else:
             feature_model.update()
             
         # make a deep copy of the reasoning model and partial train it
         
         reasoning_model = copy.deepcopy(current_reasoning_model)
         reasoning_model.partial_fit(X[doc], y[doc], top_feat, w_n, w_a)
         
         # pooling model
         
         pooling_model = PoolingMNB()
         pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])
         
         # evaluate
         
         opt_model = None
         
         if self.optimize == "P":
             opt_model = pooling_model
         elif self.optimize == "I":
             opt_model = instance_model
         elif self.optimize == "F":
             opt_model = feature_model
         elif self.optimize == "R":
             opt_model = reasoning_model
         else:
             raise ValueError('Optimization Model: \'%s\' invalid!' % self.optimize)
         
         y_probas = opt_model.predict_proba(self.X_test)
         
         auc = metrics.roc_auc_score(self.y_test, y_probas[:, 1])
         aucs.append(auc)
             
     # Highest-AUC candidate wins.
     doc_id = candidates[np.argsort(aucs)[-1]]
     
     return doc_id
示例#14
0
def optimalPoolingMNBParameters(X_pool, y_pool, all_features, smoothing, num_feat):
    """Grid-search the PoolingMNB hyper-parameters via 5-fold CV over the pool.

    For every combination of instance-model alpha, feature-model r-value and
    instance-model pooling weight, trains an instance MultinomialNB and a
    FeatureMNBUniform per fold, pools them, collects out-of-fold predicted
    probabilities for every pool document, and scores AUC over the whole pool.

    Parameters
    ----------
    X_pool : sparse matrix, one row per pool document
    y_pool : array-like of binary labels (0/1), aligned with X_pool rows
    all_features : per-document rationale feature(s); falsy when a document
        has none (then it is skipped for the feature model)
    smoothing : smoothing constant passed through to FeatureMNBUniform
    num_feat : total number of features in the vocabulary

    Returns
    -------
    (optimal_alpha, optimal_rValue, optimal_model_weights) where
    optimal_model_weights is a length-2 array [instance weight, feature weight].
    """
    grid_alpha = [0.01, 0.1, 1.0, 10.0, 100.0]
    grid_rValue = [100.0, 1000.0]
    grid_IMweight = [0.99, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.01]
    # search same values for w0, because this is learning without rationales

    max_auc = -1
    optimal_alpha = -1
    optimal_rValue = -1
    optimal_model_weights = np.array([-1.0, -1.0])

    # The fold split depends only on the pool size and old-API KFold is
    # deterministic without shuffle, so build it once instead of once per
    # grid combination.
    kf = KFold(len(y_pool), n_folds=5)

    for alpha in grid_alpha:
        for rValue in grid_rValue:
            for IMweight in grid_IMweight:

                # out-of-fold class probabilities for every pool document
                all_probabilities = np.ndarray(shape=(len(y_pool), 2))

                for train, test in kf:
                    rationales_c0 = set()
                    rationales_c1 = set()

                    feature_model = FeatureMNBUniform(rationales_c0, rationales_c1,
                                                      num_feat, smoothing,
                                                      [0.5, 0.5], rValue)

                    # Collect training rows and stack once at the end;
                    # repeated sp.vstack inside the loop is O(n^2).
                    train_rows = []
                    y_train = []
                    for doc_id in train:
                        if all_features[doc_id]:
                            feature_model.fit(all_features[doc_id], y_pool[doc_id])
                        train_rows.append(sp.csr_matrix(X_pool[doc_id], dtype=float))
                        y_train.append(y_pool[doc_id])
                    X_train = sp.vstack(train_rows)

                    instance_model = MultinomialNB(alpha=alpha)
                    instance_model.fit(X_train, y_train)

                    # Pool the two models as a convex combination.
                    poolingMNBWeights = np.array([IMweight, 1. - IMweight])
                    model = PoolingMNB()
                    model.fit(instance_model, feature_model, weights=poolingMNBWeights)

                    # Fancy-index assignment places each fold's predictions at
                    # the corresponding pool positions.
                    all_probabilities[test] = model.predict_proba(X_pool[test])

                # compute AUC based on all instances in the training data
                auc = metrics.roc_auc_score(y_pool, all_probabilities[:, 1])

                if auc > max_auc:
                    max_auc = auc
                    optimal_alpha = alpha
                    optimal_rValue = rValue
                    optimal_model_weights[0] = IMweight
                    optimal_model_weights[1] = 1. - IMweight

    return optimal_alpha, optimal_rValue, optimal_model_weights