def run_trials(num_trials, dataset, selection_strategy, metric, C, alpha, smoothing, \
               bootstrap_size, balance, coverage, disagree_strat, budget, fmtype, rmw_n, rmw_a, seed=0, Debug=False, \
               reasoning_strategy='random', switch=40):
    '''Run ``num_trials`` independent active-learning trials on ``dataset``.

    For each trial a fresh instance model (MultinomialNB), a feature model
    (uniform or weighted, chosen by ``fmtype``), a pooling model and a
    reasoning model are created, an initial training set is optionally
    bootstrapped, and the actual active-learning loop is delegated to
    ``learn``.  Trial ``i`` is seeded with ``seed + i`` so runs are
    reproducible but distinct.

    Returns a tuple ``(result, feat_names, feat_freq)``: ``result`` is a
    NumPy object array holding one ``learn(...)`` return value per trial,
    ``feat_names`` the dataset's feature names (feature indices when the
    dataset provides none), and ``feat_freq`` the per-feature document
    frequency of the pool.
    '''
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)
    if not feat_names:
        # dataset supplied no names; fall back to plain feature indices
        feat_names = np.arange(X_pool.shape[1])
    # per-feature document frequency read off the CSC column pointers
    feat_freq = np.diff(X_pool.tocsc().indptr)
    # NOTE(review): the feature expert is built with a fixed smoothing of
    # 1e-6; the ``smoothing`` argument only reaches the uniform feature
    # model below — confirm this asymmetry is intended.
    fe = feature_expert(X_pool, y_pool, metric, smoothing=1e-6, C=C, pick_only_top=True)
    result = np.ndarray(num_trials, dtype=object)
    for i in range(num_trials):
        print '-' * 50
        print 'Starting Trial %d of %d...' % (i + 1, num_trials)
        trial_seed = seed + i # initialize the seed for the trial
        instance_model = MultinomialNB(alpha=alpha)
        feature_model = None
        if fmtype == "fm_uniform":
            feature_model = FeatureMNBUniform([], [], fe.num_features, smoothing)
        elif fmtype == "fm_weighted":
            feature_model = FeatureMNBWeighted(num_feat=fe.num_features, imaginary_counts=1.)
        else:
            raise ValueError('Feature model type: \'%s\' invalid!' % fmtype)
        pooling_model = PoolingMNB()
        reasoning_model = ReasoningMNB(alpha=1)
        if bootstrap_size == 0:
            # no bootstrap: start empty and make the whole pool selectable
            training_set = []
            pool_set = range(X_pool.shape[0])
        else:
            training_set, pool_set = RandomBootstrap(X_pool, y_pool, bootstrap_size, balance, trial_seed)
        result[i] = learn(X_pool, y_pool, X_test, y_test, training_set, pool_set, \
                          fe, selection_strategy, disagree_strat, coverage, budget, instance_model, \
                          feature_model, pooling_model, reasoning_model, rmw_n, rmw_a, trial_seed, Debug, \
                          reasoning_strategy, switch)
    return result, feat_names, feat_freq
def full_knowledge(dataset, metric='mutual_info', C=0.1, alpha=1, smoothing=0):
    '''
    This function uses the entire pool to learn the instance model,
    feature model, and pooling model which provides an upper bound on
    how well these models can perform on this particular dataset
    '''
    # NOTE(review): an identical redefinition of ``full_knowledge`` appears
    # later in this file; if both live in the same module, this definition is
    # shadowed and never called.
    # NOTE(review): the three score dicts below are populated but never
    # returned — callers can only observe the printed output.
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)
    fe = feature_expert(X_pool, y_pool, metric, smoothing=1e-6, C=C)
    instance_model_scores = {'auc': [], 'accu': []}
    feature_model_scores = {'auc': [], 'accu': []}
    pooling_model_scores = {'auc': [], 'accu': []}
    instance_model = MultinomialNB(alpha=alpha)
    feature_model = FeatureMNBUniform([], [], fe.num_features, smoothing)
    pooling_model = PoolingMNB()
    # instance model sees the full pool at once
    instance_model.fit(X_pool, y_pool)
    # feature model is trained incrementally, one expert feature per document
    for doc in range(X_pool.shape[0]):
        feature = fe.most_informative_feature(X_pool[doc], y_pool[doc])
        feature_model.fit(feature, y_pool[doc])
    # Evaluate performance based on Instance Model
    (accu, auc) = evaluate_model(instance_model, X_test, y_test)
    instance_model_scores['auc'].append(auc)
    instance_model_scores['accu'].append(accu)
    print 'Instance Model: auc = %f, accu = %f' % (auc, accu)
    # Evaluate performance on Feature Model
    (accu, auc) = evaluate_model(feature_model, X_test, y_test)
    feature_model_scores['auc'].append(auc)
    feature_model_scores['accu'].append(accu)
    print 'Feature Model: auc = %f, accu = %f' % (auc, accu)
    # Evaluate performance on Pooled Model
    pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])
    (accu, auc) = evaluate_model(pooling_model, X_test, y_test)
    pooling_model_scores['auc'].append(auc)
    pooling_model_scores['accu'].append(accu)
    print 'Pooled Model: auc = %f, accu = %f' % (auc, accu)
def full_knowledge(dataset, metric='mutual_info', C=0.1, alpha=1, smoothing=0): ''' This function uses the entire pool to learn the instance model, feature model, and pooling model which provides an upper bound on how well these models can perform on this particular dataset ''' (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset) fe = feature_expert(X_pool, y_pool, metric, smoothing=1e-6, C=C) instance_model_scores = {'auc':[], 'accu':[]} feature_model_scores = {'auc':[], 'accu':[]} pooling_model_scores = {'auc':[], 'accu':[]} instance_model = MultinomialNB(alpha=alpha) feature_model = FeatureMNBUniform([], [], fe.num_features, smoothing) pooling_model = PoolingMNB() instance_model.fit(X_pool, y_pool) for doc in range(X_pool.shape[0]): feature = fe.most_informative_feature(X_pool[doc], y_pool[doc]) feature_model.fit(feature, y_pool[doc]) # Evaluate performance based on Instance Model (accu, auc) = evaluate_model(instance_model, X_test, y_test) instance_model_scores['auc'].append(auc) instance_model_scores['accu'].append(accu) print 'Instance Model: auc = %f, accu = %f' % (auc, accu) # Evaluate performance on Feature Model (accu, auc) = evaluate_model(feature_model, X_test, y_test) feature_model_scores['auc'].append(auc) feature_model_scores['accu'].append(accu) print 'Feature Model: auc = %f, accu = %f' % (auc, accu) # Evaluate performance on Pooled Model pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5]) (accu, auc) = evaluate_model(pooling_model, X_test, y_test) pooling_model_scores['auc'].append(auc) pooling_model_scores['accu'].append(accu) print 'Pooled Model: auc = %f, accu = %f' % (auc, accu)
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, smoothing=0, poolingMNBWeights=[0.5, 0.5], poolingFM_r=100.0, lr_C=1, svm_C=1, \
          zaidan_C=1, zaidan_Ccontrast=1, zaidan_nu=1, cvTrain=False, Debug=False):
    '''Active-learning loop with per-model rationale handling.

    Starting from ``training_set`` (document ids into ``X_pool``), repeatedly
    picks ``step_size`` documents from ``pool_set`` with the chosen
    ``selection_strategy``, asks the ``feature_expert`` for a rationale
    feature per document, rebuilds the training matrix for the given
    ``model_type`` ('mnb', 'mnb_LwoR', 'svm_linear', 'svm_linear_LwoR',
    'lrl1', 'lrl2', 'poolingMNB', 'Zaidan'), retrains, and records test-set
    AUC/accuracy after every batch until ``budget`` documents are labeled.

    Returns ``(np.array(num_training_samples), model_scores)`` where
    ``model_scores`` maps metric/parameter names to per-round lists.

    NOTE(review): ``poolingMNBWeights=[0.5, 0.5]`` is a mutable default
    argument — shared across calls if ever mutated.
    NOTE(review): when ``cvTrain`` is False, ``X_train`` stays None and (for
    the SVM variants) ``C`` is never bound, so training below would fail;
    callers appear expected to pass ``cvTrain=True``.
    '''
    start = time()
    print '-' * 50
    print 'Starting Active Learning...'
    _, num_feat = X_pool.shape
    # per-round metrics plus the (re-)tuned hyperparameters for each round
    model_scores = {'auc': [], 'accu': [], 'wr': [], 'wo': [], 'alpha': [], 'svm_C': [], 'zaidan_C': [], 'zaidan_Ccontrast': [], 'zaidan_nu': [], 'FMrvalue': [], 'IMweight': [], 'FMweight': []}
    rationales = set()
    rationales_c0 = set()  # rationale features seen with label 0
    rationales_c1 = set()  # rationale features seen with label 1
    number_of_docs = 0
    feature_expert.rg.seed(seed)
    num_training_samples = []
    all_features = []
    # keep all the training data instance ids in docs list
    docs = training_set
    X_train = None
    y_train = []
    sample_weight = []
    # ask the expert for one rationale feature per bootstrap document
    for doc_id in docs:
        #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
        feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])
        number_of_docs = number_of_docs + 1
        # append feature to all_features, even if it is None
        all_features.append(feature)
        if feature is not None:
            rationales.add(feature)
            if y_pool[doc_id] == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)
    if cvTrain:
        # get optimal parameters depending on the model_type
        if model_type == 'mnb_LwoR':
            w_r, w_o = optimalMNBLwoRParameters(X_pool[training_set], y_pool[training_set], all_features)
            feature_counter = 0
            for doc_id in docs:
                # scale the rationale feature by w_r and all others by w_o
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        elif model_type == 'mnb':
            w_r, w_o = optimalMNBParameters(X_pool[training_set], y_pool[training_set], all_features)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        elif model_type == 'svm_linear':
            w_r, w_o, C = optimalSVMParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        elif model_type == 'svm_linear_LwoR':
            w_r, w_o, C = optimalSVMLwoRParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        if model_type == 'poolingMNB':
            # empirical class priors from the current training labels
            classpriors = np.zeros(2)
            classpriors[1] = (np.sum(y_pool[docs]) * 1.) / (len(docs) * 1.)
            classpriors[0] = 1. - classpriors[1]
            alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(X_pool[training_set], y_pool[training_set], all_features, smoothing, num_feat)
            feature_model = FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, smoothing, classpriors, poolingFM_r)
            feature_counter = 0
            for doc_id in docs:
                # NOTE(review): truthiness test skips feature id 0 as well as
                # None — confirm feature indices never legitimately equal 0
                if all_features[feature_counter]:
                    # updates feature model with features one at a time
                    feature_model.fit(all_features[feature_counter], y_pool[doc_id])
                feature_counter = feature_counter + 1
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        if model_type == 'Zaidan':
            zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                if all_features[feature_counter] is not None:
                    x_pseudo = (X_pool[doc_id]).todense()
                    # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            test = x[0, f]
                            x_pseudo[0, f] = x[0, f] / zaidan_nu
                        else:
                            x_pseudo[0, f] = 0.0
                    x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)
                if not y_train:
                    X_train = x
                    if all_features[feature_counter] is not None:
                        X_train = sp.vstack((X_train, x_pseudo))
                else:
                    X_train = sp.vstack((X_train, x))
                    if all_features[feature_counter] is not None:
                        X_train = sp.vstack((X_train, x_pseudo))
                y_train.append(y_pool[doc_id])
                if all_features[feature_counter] is not None:
                    # append y label again for the pseudoinstance created
                    y_train.append(y_pool[doc_id])
                sample_weight.append(zaidan_C)
                if all_features[feature_counter] is not None:
                    # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                    sample_weight.append(zaidan_Ccontrast)
                feature_counter = feature_counter + 1
    # Train the model
    if model_type == 'lrl2':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
    elif model_type == 'lrl1':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)
    elif model_type == 'mnb':
        model = MultinomialNB(alpha=alpha)
    elif model_type == 'mnb_LwoR':
        model = MultinomialNB(alpha=alpha)
    elif model_type == 'svm_linear_LwoR':
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type == 'svm_linear':
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type == 'poolingMNB':
        instance_model = MultinomialNB(alpha=alpha)
        model = PoolingMNB()
    elif model_type == 'Zaidan':
        random_state = np.random.RandomState(seed=seed)
        # NOTE(review): C is hard-coded to 1.0 here but the retraining inside
        # the while-loop below uses C=svm_C — confirm which is intended
        model = svm.SVC(kernel='linear', C=1.0, random_state=random_state)
    if model_type == 'poolingMNB':
        instance_model.fit(X_train, y_train)
        model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model
    elif model_type == 'Zaidan':
        model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
    else:
        model.fit(X_train, np.array(y_train))
    # score the freshly trained model and log the hyperparameters used
    (accu, auc) = evaluate_model(model, X_test, y_test)
    model_scores['auc'].append(auc)
    model_scores['accu'].append(accu)
    if model_type == 'poolingMNB':
        # NOTE(review): 'alpha' is appended here AND again (as 0.0) in the
        # mnb branch below, so for poolingMNB the alpha list grows twice per
        # round — likely a bookkeeping bug
        model_scores['alpha'].append(alpha)
        model_scores['FMrvalue'].append(poolingFM_r)
        model_scores['IMweight'].append(poolingMNBWeights[0])
        model_scores['FMweight'].append(poolingMNBWeights[1])
    else:
        model_scores['FMrvalue'].append(0.0)
        model_scores['IMweight'].append(0.0)
        model_scores['FMweight'].append(0.0)
    if model_type == 'Zaidan':
        model_scores['zaidan_C'].append(zaidan_C)
        model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
        model_scores['zaidan_nu'].append(zaidan_nu)
    else:
        model_scores['zaidan_C'].append(0.0)
        model_scores['zaidan_Ccontrast'].append(0.0)
        model_scores['zaidan_nu'].append(0.0)
    if model_type == 'mnb' or model_type == 'mnb_LwoR':
        model_scores['alpha'].append(alpha)
    else:
        model_scores['alpha'].append(0.0)
    if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR':
        model_scores['svm_C'].append(C)
    else:
        model_scores['svm_C'].append(0.0)
    if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR' or model_type == 'mnb' or model_type == 'mnb_LwoR':
        model_scores['wr'].append(w_r)
        model_scores['wo'].append(w_o)
    else:
        model_scores['wr'].append(0.0)
        model_scores['wo'].append(0.0)
    num_training_samples.append(number_of_docs)
    feature_expert.rg.seed(seed)
    # instantiate the document-selection strategy
    if selection_strategy == 'random':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'unc':
        doc_pick_model = UNCSampling()
    elif selection_strategy == "pnc":
        doc_pick_model = UNCPreferNoConflict()
    elif selection_strategy == "pnr":
        doc_pick_model = UNCPreferNoRationale()
    elif selection_strategy == "pr":
        doc_pick_model = UNCPreferRationale()
    elif selection_strategy == "pc":
        doc_pick_model = UNCPreferConflict()
    elif selection_strategy == "tt":
        doc_pick_model = UNCThreeTypes()
    elif selection_strategy == "pipe":
        doc_pick_model = Pipe([UNCSampling(), UNCPreferConflict()], [10, 30])
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy)
    k = step_size
    #while X_train.shape[0] < budget:
    while number_of_docs < budget:
        # Choose a document based on the strategy chosen
        if selection_strategy == "pnc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pnr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "tt":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pipe":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)
        if doc_ids is None or len(doc_ids) == 0:
            break
        for doc_id in doc_ids:
            #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
            feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])
            all_features.append(feature)
            number_of_docs = number_of_docs + 1
            if feature is not None:
                rationales.add(feature)
                if y_pool[doc_id] == 0:
                    rationales_c0.add(feature)
                else:
                    rationales_c1.add(feature)
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(long(doc_id))
        if cvTrain:
            # get optimal parameters depending on the model_type; the training
            # matrix is rebuilt from scratch every round, and hyperparameters
            # are re-tuned every 20 labeled docs (when count % 20 == 10)
            X_train = None
            y_train = []
            sample_weight = []
            if model_type == 'mnb_LwoR':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o = optimalMNBLwoRParameters(X_pool[training_set], y_pool[training_set], all_features)
                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]
                    feature_counter = feature_counter + 1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
                    y_train.append(y_pool[doc_id])
            elif model_type == 'mnb':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o = optimalMNBParameters(X_pool[training_set], y_pool[training_set], all_features)
                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]
                    feature_counter = feature_counter + 1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
                    y_train.append(y_pool[doc_id])
            elif model_type == 'svm_linear_LwoR':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o, C = optimalSVMLwoRParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]
                    feature_counter = feature_counter + 1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
                    y_train.append(y_pool[doc_id])
            elif model_type == 'svm_linear':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o, C = optimalSVMParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]
                    feature_counter = feature_counter + 1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
                    y_train.append(y_pool[doc_id])
            if model_type == 'poolingMNB':
                classpriors = np.zeros(2)
                classpriors[1] = (np.sum(y_pool[docs]) * 1.) / (len(docs) * 1.)
                classpriors[0] = 1. - classpriors[1]
                if np.mod(number_of_docs, 20) == 10:
                    alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(X_pool[training_set], y_pool[training_set], all_features, smoothing, num_feat)
                feature_model = FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, smoothing, classpriors, poolingFM_r)
                feature_counter = 0
                for doc_id in training_set:
                    if all_features[feature_counter]:
                        # updates feature model with features one at a time
                        feature_model.fit(all_features[feature_counter], y_pool[doc_id])
                    feature_counter = feature_counter + 1
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
                    y_train.append(y_pool[doc_id])
            if model_type == 'Zaidan':
                if np.mod(number_of_docs, 20) == 10:
                    zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                    if all_features[feature_counter] is not None:
                        x_pseudo = (X_pool[doc_id]).todense()
                        # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                        x_feats = x[0].indices
                        for f in x_feats:
                            if f == all_features[feature_counter]:
                                test = x[0, f]
                                x_pseudo[0, f] = x[0, f] / zaidan_nu
                            else:
                                x_pseudo[0, f] = 0.0
                        x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)
                    if not y_train:
                        X_train = x
                        if all_features[feature_counter] is not None:
                            X_train = sp.vstack((X_train, x_pseudo))
                    else:
                        X_train = sp.vstack((X_train, x))
                        if all_features[feature_counter] is not None:
                            X_train = sp.vstack((X_train, x_pseudo))
                    y_train.append(y_pool[doc_id])
                    if all_features[feature_counter] is not None:
                        # append y label again for the pseudoinstance created
                        y_train.append(y_pool[doc_id])
                    sample_weight.append(zaidan_C)
                    if all_features[feature_counter] is not None:
                        # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                        sample_weight.append(zaidan_Ccontrast)
                    feature_counter = feature_counter + 1
        # Train the model
        if model_type == 'lrl2':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
        elif model_type == 'lrl1':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)
        elif model_type == 'mnb':
            model = MultinomialNB(alpha=alpha)
        elif model_type == 'mnb_LwoR':
            model = MultinomialNB(alpha=alpha)
        elif model_type == 'svm_linear_LwoR':
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=C, random_state=random_state)
        elif model_type == 'svm_linear':
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=C, random_state=random_state)
        elif model_type == 'poolingMNB':
            instance_model = MultinomialNB(alpha=alpha)
            model = PoolingMNB()
        elif model_type == 'Zaidan':
            random_state = np.random.RandomState(seed=seed)
            model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state)
        if model_type == 'poolingMNB':
            instance_model.fit(X_train, y_train)
            model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model
        elif model_type == 'Zaidan':
            model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
        else:
            model.fit(X_train, np.array(y_train))
        # per-round evaluation and hyperparameter bookkeeping (mirrors the
        # pre-loop block above)
        (accu, auc) = evaluate_model(model, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        if model_type == 'poolingMNB':
            model_scores['alpha'].append(alpha)
            model_scores['FMrvalue'].append(poolingFM_r)
            model_scores['IMweight'].append(poolingMNBWeights[0])
            model_scores['FMweight'].append(poolingMNBWeights[1])
        else:
            model_scores['FMrvalue'].append(0.0)
            model_scores['IMweight'].append(0.0)
            model_scores['FMweight'].append(0.0)
        if model_type == 'Zaidan':
            model_scores['zaidan_C'].append(zaidan_C)
            model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
            model_scores['zaidan_nu'].append(zaidan_nu)
        else:
            model_scores['zaidan_C'].append(0.0)
            model_scores['zaidan_Ccontrast'].append(0.0)
            model_scores['zaidan_nu'].append(0.0)
        if model_type == 'mnb' or model_type == 'mnb_LwoR':
            model_scores['alpha'].append(alpha)
        else:
            model_scores['alpha'].append(0.0)
        if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR':
            model_scores['svm_C'].append(C)
        else:
            model_scores['svm_C'].append(0.0)
        if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR' or model_type == 'mnb' or model_type == 'mnb_LwoR':
            model_scores['wr'].append(w_r)
            model_scores['wo'].append(w_o)
        else:
            model_scores['wr'].append(0.0)
            model_scores['wo'].append(0.0)
        num_training_samples.append(number_of_docs)
    print 'Active Learning took %2.2fs' % (time() - start)
    return (np.array(num_training_samples), model_scores)
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, poolingMNBWeights=[0.5, 0.5], Meville_etal_r=100.0, lr_C=1, svm_C=1, \
          Zaidan_etal_C=1, Zaidan_etal_Ccontrast=1, Zaidan_etal_mu=1, Debug=False):
    '''Active-learning loop (Melville et al. / Zaidan et al. baselines).

    Simplified variant of the cross-validated ``learn`` above: rationale
    features are either folded into a pooled feature model
    ('Melville_etal'), turned into contrast pseudoinstances ('Zaidan_etal'),
    or used to reweight features by ``w_r``/``w_o`` for the plain models
    ('mnb', 'lrl1', 'lrl2', 'svm_linear').  Documents are acquired with
    ``selection_strategy`` ('RND', 'UNC', 'UNC_PNC', 'UNC_PC') until the
    training matrix reaches ``budget`` rows.

    Returns ``(np.array(num_training_samples), model_scores)``.

    NOTE(review): if both ``learn`` definitions share a module, this one
    shadows the earlier one.
    NOTE(review): ``poolingMNBWeights`` is a mutable default argument, and
    ``Meville_etal_r`` appears to be a misspelling of "Melville".
    NOTE(review): this ``FeatureMNBUniform(...)`` call passes 5 positional
    args while other call sites pass 6 (including ``smoothing``) — confirm
    the constructor signature.
    '''
    start = time()
    print('-' * 50)
    print('Starting Active Learning...')
    _, num_feat = X_pool.shape
    model_scores = {'auc': [], 'accu': []}
    rationales = set()
    rationales_c0 = set()  # rationale features seen with label 0
    rationales_c1 = set()  # rationale features seen with label 1
    feature_expert.rg.seed(seed)
    num_training_samples = []
    number_of_docs = 0
    docs = training_set
    X_train = None
    y_train = []
    sample_weight = list()
    if model_type == 'Melville_etal':
        # create feature model
        classpriors = np.zeros(2)
        classpriors[1] = (np.sum(y_pool[docs]) * 1.) / (len(docs) * 1.)
        classpriors[0] = 1. - classpriors[1]
        feature_model = FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, classpriors, Meville_etal_r)
    # build the initial training matrix from the bootstrap documents
    for doc_id in docs:
        number_of_docs = number_of_docs + 1
        feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])
        if model_type == 'Melville_etal':
            if feature:
                feature_model.fit(feature, y_pool[doc_id])
        # NOTE(review): unlike the other ``learn``, these adds are not guarded
        # by ``feature is not None`` — a missing rationale inserts None
        rationales.add(feature)
        if y_pool[doc_id] == 0:
            rationales_c0.add(feature)
        else:
            rationales_c1.add(feature)
        if model_type == 'Zaidan_etal':
            x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
            if feature is not None:
                x_pseudo = (X_pool[doc_id]).todense()
                # create pseudoinstances based on rationales provided; one pseudoinstance is created per rationale.
                x_feats = x[0].indices
                for f in x_feats:
                    if f == feature:
                        test = x[0, f]
                        x_pseudo[0, f] = x[0, f] / Zaidan_etal_mu
                    else:
                        x_pseudo[0, f] = 0.0
                x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)
        else:
            x = sp.csr_matrix(X_pool[doc_id], dtype=float)
            if "Melville_etal" not in model_type:
                # emphasize the rationale feature, de-emphasize the rest
                x_feats = x[0].indices
                for f in x_feats:
                    if f == feature:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
        if model_type == 'Zaidan_etal':
            if not y_train:
                X_train = x
                if feature is not None:
                    X_train = sp.vstack((X_train, x_pseudo))
            else:
                X_train = sp.vstack((X_train, x))
                if feature is not None:
                    X_train = sp.vstack((X_train, x_pseudo))
            y_train.append(y_pool[doc_id])
            if feature is not None:
                # append y label again for the pseudoinstance created
                y_train.append(y_pool[doc_id])
            sample_weight.append(Zaidan_etal_C)
            if feature is not None:
                # append instance weight=Zaidan_etal_Ccontrast for the pseudoinstance created
                sample_weight.append(Zaidan_etal_Ccontrast)
        else:
            if not y_train:
                X_train = x
            else:
                X_train = sp.vstack((X_train, x))
            y_train.append(y_pool[doc_id])
    # Train the model
    if model_type == 'lrl2':
        random_state = RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
    elif model_type == 'lrl1':
        random_state = RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)
    elif model_type == 'mnb':
        model = MultinomialNB(alpha=alpha)
    elif model_type == 'svm_linear':
        random_state = RandomState(seed=seed)
        model = LinearSVC(C=svm_C, random_state=random_state)
    elif model_type == 'Melville_etal':
        instance_model = MultinomialNB(alpha=alpha)
        model = PoolingMNB()
    elif model_type == 'Zaidan_etal':
        random_state = RandomState(seed=seed)
        model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state)
    if model_type == 'Melville_etal':
        #feature_model.fit(feature, y_pool[doc_id])
        instance_model.fit(X_train, y_train)
        model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model
    elif model_type == 'Zaidan_etal':
        model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
    else:
        model.fit(X_train, np.array(y_train))
    (accu, auc) = evaluate_model(model, X_test, y_test)
    model_scores['auc'].append(auc)
    model_scores['accu'].append(accu)
    num_training_samples.append(number_of_docs)
    feature_expert.rg.seed(seed)
    # instantiate the document-selection strategy
    if selection_strategy == 'RND':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'UNC':
        doc_pick_model = UNCSampling()
    elif selection_strategy == 'UNC_PNC':
        doc_pick_model = UNCPreferNoConflict()
    elif selection_strategy == 'UNC_PC':
        doc_pick_model = UNCPreferConflict()
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy)
    k = step_size
    # budget here counts training-matrix rows (incl. Zaidan pseudoinstances)
    while X_train.shape[0] < budget:
        # Choose a document based on the strategy chosen
        if selection_strategy == 'UNC_PNC':
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == 'UNC_PC':
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)
        if doc_ids is None or len(doc_ids) == 0:
            break
        for doc_id in doc_ids:
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(doc_id)
            #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
            feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])
            if model_type == 'Melville_etal':
                if feature:
                    feature_model.fit(feature, y_pool[doc_id])
            number_of_docs = number_of_docs + 1
            rationales.add(feature)
            if y_pool[doc_id] == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)
            if model_type == 'Zaidan_etal':
                x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                if feature is not None:
                    x_pseudo = (X_pool[doc_id]).todense()
                    # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == feature:
                            test = x[0, f]
                            x_pseudo[0, f] = x[0, f] / Zaidan_etal_mu
                        else:
                            x_pseudo[0, f] = 0.0
                    x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)
            else:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                if "Melville_etal" not in model_type:
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == feature:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]
            if model_type == 'Zaidan_etal':
                X_train = sp.vstack((X_train, x))
                if feature is not None:
                    X_train = sp.vstack((X_train, x_pseudo))
                y_train.append(y_pool[doc_id])
                if feature is not None:
                    # append y label again for the pseudoinstance created
                    y_train.append(y_pool[doc_id])
                sample_weight.append(Zaidan_etal_C)
                if feature is not None:
                    # append instance weight=Zaidan_etal_Ccontrast for the pseudoinstance created
                    sample_weight.append(Zaidan_etal_Ccontrast)
            else:
                X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        # Train the model (fresh estimator each round, fit on the grown set)
        if model_type == 'lrl2':
            random_state = RandomState(seed=seed)
            model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
        elif model_type == 'lrl1':
            random_state = RandomState(seed=seed)
            model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)
        elif model_type == 'mnb':
            model = MultinomialNB(alpha=alpha)
        elif model_type == 'svm_linear':
            random_state = RandomState(seed=seed)
            model = LinearSVC(C=svm_C, random_state=random_state)
        elif model_type == 'Melville_etal':
            instance_model = MultinomialNB(alpha=alpha)
            model = PoolingMNB()
        elif model_type == 'Zaidan_etal':
            random_state = RandomState(seed=seed)
            model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state)
        if model_type == 'Melville_etal':
            instance_model.fit(X_train, y_train)
            model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model
        elif model_type == 'Zaidan_etal':
            model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
        else:
            model.fit(X_train, np.array(y_train))
        (accu, auc) = evaluate_model(model, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        num_training_samples.append(number_of_docs)
    print('Active Learning took %2.2fs' % (time() - start))
    return (np.array(num_training_samples), model_scores)
# Interactive walkthrough of the picked documents, then two quick baselines.
# NOTE(review): this fragment references names (doc_ids, feature_names,
# X_pool_docs, args) defined outside this chunk — presumably part of a
# larger debug/driver script.
top_n = 20  # number of top-ranked features to display per document
print '\n'
print '=' * 50
for doc in doc_ids:
    print_all_features(feature_names, fe, top_n, doc, X_pool, y_pool, X_pool_docs)
    print '=' * 50
    # pause between documents; typing 'n' ends the walkthrough early
    ch = raw_input('Display the next document? Press Enter to continue or type \'n\' to exit... ')
    if ch == 'n':
        break
t0 = time()
print '-' * 50
print 'Starting to train feature_model(MNB)...'
# uniform-prior feature MNB fed the expert's best feature per pool document
feature_model = FeatureMNBUniform([], [], num_feat=X_pool.shape[1], smoothing=1e-6, class_prior=[0.5, 0.5], r=100.)
for doc in range(X_pool.shape[0]):
    feature = fe.most_informative_feature(X_pool[doc], y_pool[doc])
    if feature:
        feature_model.fit(feature, y_pool[doc]) # train feature_model one by one
print 'Feature Model(MNB): accu = %f, auc = %f' % evaluate_model(feature_model, X_test, y_test)
print 'Training feature_model(MNB) took %5.2f' % (time() - t0)
# L1-regularized logistic regression baseline trained on the whole pool
logit = linear_model.LogisticRegression(C=args.c, penalty='l1')
logit.fit(X_pool, y_pool)
print 'Feature Model(LogisticRegression): accu = %f, auc = %f' % evaluate_model(logit, X_test, y_test)
def choice(self, X, y, pool, train_indices, current_feature_model, current_reasoning_model, w_n, w_a):
    """Greedily pick the unlabeled document that maximizes simulated AUC.

    Samples up to self.sub_pool candidates from `pool`; for each candidate it
    retrains instance/feature/reasoning/pooling models as if that document had
    been labeled, and scores the model selected by self.optimize on
    (self.X_test, self.y_test).  Returns the best candidate's doc id.

    NOTE(review): assumes X and y are indexable by index lists (e.g. a scipy
    sparse matrix and a numpy array) -- confirm against callers.
    """
    rand_indices = self.rgen.permutation(len(pool))
    candidates = [pool[i] for i in rand_indices[:self.sub_pool]]
    aucs = []
    for doc in candidates:
        # Hypothetically add this candidate to the labeled set.
        new_train_indices = list(train_indices)
        new_train_indices.append(doc)
        # train an instance model
        instance_model = MultinomialNB(alpha=1.)
        instance_model.fit(X[new_train_indices], y[new_train_indices])
        # train a feature model (a fresh copy, so the current one is not mutated)
        feature_model = None
        if isinstance(current_feature_model, FeatureMNBUniform):
            feature_model = FeatureMNBUniform(current_feature_model.class0_feats, current_feature_model.class1_feats, self.feature_expert.num_features, 0)
        elif isinstance(current_feature_model, FeatureMNBWeighted):
            feature_model = FeatureMNBWeighted(num_feat=self.feature_expert.num_features, feat_count=current_feature_model.feature_count_, imaginary_counts=current_feature_model.imaginary_counts)
        else:
            raise ValueError('Feature model type: \'%s\' unknown!' % current_feature_model.__class__.__name__)
        top_feat = self.feature_expert.most_informative_feature(X[doc], y[doc])
        if top_feat:
            feature_model.fit(top_feat, y[doc])  # fit also calls update; so there is no need to update again
        else:
            feature_model.update()
        # make a deep copy of the reasoning model and partial train it
        reasoning_model = copy.deepcopy(current_reasoning_model)
        reasoning_model.partial_fit(X[doc], y[doc], top_feat, w_n, w_a)
        # pooling model: equal-weight combination of instance and feature models
        pooling_model = PoolingMNB()
        pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])
        # evaluate the model selected by self.optimize
        opt_model = None
        if self.optimize == "P":
            opt_model = pooling_model
        elif self.optimize == "I":
            opt_model = instance_model
        elif self.optimize == "F":
            opt_model = feature_model
        elif self.optimize == "R":
            opt_model = reasoning_model
        else:
            raise ValueError('Optimization Model: \'%s\' invalid!' % self.optimize)
        y_probas = opt_model.predict_proba(self.X_test)
        auc = metrics.roc_auc_score(self.y_test, y_probas[:, 1])
        aucs.append(auc)
    # candidate with the highest simulated AUC wins
    doc_id = candidates[np.argsort(aucs)[-1]]
    return doc_id
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \ selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, poolingMNBWeights=[0.5, 0.5], Meville_etal_r=100.0, lr_C=1, svm_C=1, \ Zaidan_etal_C=1, Zaidan_etal_Ccontrast=1, Zaidan_etal_mu=1, Debug=False): start = time() print '-' * 50 print 'Starting Active Learning...' _, num_feat = X_pool.shape model_scores = {'auc':[], 'accu':[]} rationales = set() rationales_c0 = set() rationales_c1 = set() feature_expert.rg.seed(seed) num_training_samples = [] number_of_docs = 0 docs = training_set X_train = None y_train = [] if model_type=='Melville_etal': # create feature model classpriors=np.zeros(2) classpriors[1] = (np.sum(y_pool[docs])*1.)/(len(docs)*1.) classpriors[0] = 1. - classpriors[1] feature_model = FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, classpriors, Meville_etal_r) for doc_id in docs: number_of_docs=number_of_docs+1 feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id]) if model_type == 'Melville_etal': if feature: feature_model.fit(feature, y_pool[doc_id]) rationales.add(feature) if y_pool[doc_id] == 0: rationales_c0.add(feature) else: rationales_c1.add(feature) if model_type == 'Zaidan_etal': x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64) if feature is not None: x_pseudo = (X_pool[doc_id]).todense() # create pseudoinstances based on rationales provided; one pseudoinstance is created per rationale. 
x_feats = x[0].indices for f in x_feats: if f == feature: test= x[0,f] x_pseudo[0,f] = x[0,f]/Zaidan_etal_mu else: x_pseudo[0,f] = 0.0 x_pseudo=sp.csr_matrix(x_pseudo, dtype=np.float64) else: x = sp.csr_matrix(X_pool[doc_id], dtype=float) if "Melville_etal" not in model_type: x_feats = x[0].indices for f in x_feats: if f == feature: x[0,f] = w_r*x[0,f] else: x[0,f] = w_o*x[0,f] if model_type=='Zaidan_etal': if not y_train: X_train = x if feature is not None: X_train = sp.vstack((X_train, x_pseudo)) else: X_train = sp.vstack((X_train, x)) if feature is not None: X_train = sp.vstack((X_train, x_pseudo)) y_train.append(y_pool[doc_id]) if feature is not None: # append y label again for the pseudoinstance created y_train.append(y_pool[doc_id]) sample_weight.append(Zaidan_etal_C) if feature is not None: # append instance weight=Zaidan_etal_Ccontrast for the pseudoinstance created sample_weight.append(Zaidan_etal_Ccontrast) else: if not y_train: X_train = x else: X_train = sp.vstack((X_train, x)) y_train.append(y_pool[doc_id]) # Train the model if model_type=='lrl2': random_state = np.random.RandomState(seed=seed) model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state) elif model_type=='lrl1': random_state = np.random.RandomState(seed=seed) model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state) elif model_type=='mnb': model = MultinomialNB(alpha=alpha) elif model_type=='svm_linear': random_state = np.random.RandomState(seed=seed) model = LinearSVC(C=svm_C, random_state=random_state) elif model_type=='Melville_etal': instance_model=MultinomialNB(alpha=alpha) model = PoolingMNB() elif model_type=='Zaidan_etal': random_state = np.random.RandomState(seed=seed) model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state) if model_type=='Melville_etal': #feature_model.fit(feature, y_pool[doc_id]) instance_model.fit(X_train, y_train) model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model elif 
model_type=='Zaidan_etal': model.fit(X_train, np.array(y_train), sample_weight=sample_weight) else: model.fit(X_train, np.array(y_train)) (accu, auc) = evaluate_model(model, X_test, y_test) model_scores['auc'].append(auc) model_scores['accu'].append(accu) num_training_samples.append(number_of_docs) feature_expert.rg.seed(seed) if selection_strategy == 'RND': doc_pick_model = RandomStrategy(seed) elif selection_strategy == 'UNC': doc_pick_model = UNCSampling() elif selection_strategy == 'UNC_PNC': doc_pick_model = UNCPreferNoConflict() elif selection_strategy == 'UNC_PC': doc_pick_model = UNCPreferConflict() else: raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy) k = step_size while X_train.shape[0] < budget: # Choose a document based on the strategy chosen if selection_strategy == 'UNC_PNC': doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk) elif selection_strategy == 'UNC_PC': doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk) else: doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k) if doc_ids is None or len(doc_ids) == 0: break for doc_id in doc_ids: # Remove the chosen document from pool and add it to the training set pool_set.remove(doc_id) training_set.append(doc_id) #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id]) feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id]) if model_type=='Melville_etal': if feature: feature_model.fit(feature, y_pool[doc_id]) number_of_docs=number_of_docs+1 rationales.add(feature) if y_pool[doc_id] == 0: rationales_c0.add(feature) else: rationales_c1.add(feature) if model_type=='Zaidan_etal': x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64) if feature is not None: x_pseudo = (X_pool[doc_id]).todense() # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale. 
x_feats = x[0].indices for f in x_feats: if f == feature: test= x[0,f] x_pseudo[0,f] = x[0,f]/Zaidan_etal_mu else: x_pseudo[0,f] = 0.0 x_pseudo=sp.csr_matrix(x_pseudo, dtype=np.float64) else: x = sp.csr_matrix(X_pool[doc_id], dtype=float) if "Melville_etal" not in model_type: x_feats = x[0].indices for f in x_feats: if f == feature: x[0,f] = w_r*x[0,f] else: x[0,f] = w_o*x[0,f] if model_type=='Zaidan_etal': X_train = sp.vstack((X_train, x)) if feature is not None: X_train = sp.vstack((X_train, x_pseudo)) y_train.append(y_pool[doc_id]) if feature is not None: # append y label again for the pseudoinstance created y_train.append(y_pool[doc_id]) sample_weight.append(Zaidan_etal_C) if feature is not None: # append instance weight=Zaidan_etal_Ccontrast for the pseudoinstance created sample_weight.append(Zaidan_etal_Ccontrast) else: X_train = sp.vstack((X_train, x)) y_train.append(y_pool[doc_id]) # Train the model if model_type=='lrl2': random_state = np.random.RandomState(seed=seed) model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state) elif model_type=='lrl1': random_state = np.random.RandomState(seed=seed) model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state) elif model_type=='mnb': model = MultinomialNB(alpha=alpha) elif model_type=='svm_linear': random_state = np.random.RandomState(seed=seed) model = LinearSVC(C=svm_C, random_state=random_state) elif model_type=='Melville_etal': instance_model=MultinomialNB(alpha=alpha) model = PoolingMNB() elif model_type=='Zaidan_etal': random_state = np.random.RandomState(seed=seed) model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state) if model_type=='Melville_etal': instance_model.fit(X_train, y_train) model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model elif model_type=='Zaidan_etal': model.fit(X_train, np.array(y_train), sample_weight=sample_weight) else: model.fit(X_train, np.array(y_train)) (accu, auc) = evaluate_model(model, 
X_test, y_test) model_scores['auc'].append(auc) model_scores['accu'].append(accu) num_training_samples.append(number_of_docs) print 'Active Learning took %2.2fs' % (time() - start) return (np.array(num_training_samples), model_scores)
X_pool_docs) print '=' * 50 ch = raw_input( 'Display the next document? Press Enter to continue or type \'n\' to exit... ' ) if ch == 'n': break t0 = time() print '-' * 50 print 'Starting to train feature_model(MNB)...' feature_model = FeatureMNBUniform([], [], num_feat=X_pool.shape[1], smoothing=1e-6, class_prior=[0.5, 0.5], r=100.) for doc in range(X_pool.shape[0]): feature = fe.most_informative_feature(X_pool[doc], y_pool[doc]) if feature: feature_model.fit(feature, y_pool[doc]) # train feature_model one by one print 'Feature Model(MNB): accu = %f, auc = %f' % evaluate_model( feature_model, X_test, y_test) print 'Training feature_model(MNB) took %5.2f' % (time() - t0) logit = linear_model.LogisticRegression(C=args.c, penalty='l1') logit.fit(X_pool, y_pool)
# Script-level benchmark (Python 2): compare full-pool instance models with
# feature models built from the feature expert's rankings.
# NOTE(review): relies on args, load_dataset, evaluate_model, feature_expert
# and FeatureMNBUniform defined elsewhere in the file.
(X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(args.dataset)
models = {'MultinomialNB(alpha=1)':MultinomialNB(alpha=1), \
          'LogisticRegression(C=1, penalty=\'l1\')':LogisticRegression(C=1, penalty='l1'), \
          'LogisticRegression(C=0.1, penalty=\'l1\')':LogisticRegression(C=0.1, penalty='l1')}
aucs = {}
for mk in models.keys():
    # Train each baseline on the full pool and record its test AUC.
    models[mk].fit(X_pool, y_pool)
    _, auc = evaluate_model(models[mk], X_test, y_test)
    aucs[mk] = auc
# Feature model using ALL ranked features per class...
fe = feature_expert(X_pool, y_pool, metric="L1", C=args.c)
all_feature_model = FeatureMNBUniform(fe.feature_rank[0], fe.feature_rank[1], fe.num_features, smoothing=args.smoothing)
all_feature_model.update()
_, all_auc = evaluate_model(all_feature_model, X_test, y_test)
# ...and using only the top-k ranked features per class.
k_feature_model = FeatureMNBUniform(fe.feature_rank[0][:args.k], fe.feature_rank[1][:args.k], fe.num_features, smoothing=args.smoothing)
k_feature_model.update()
_, k_auc = evaluate_model(k_feature_model, X_test, y_test)
print '-' * 50
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
        selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, smoothing=0, poolingMNBWeights=[0.5, 0.5], poolingFM_r=100.0, lr_C=1, svm_C=1, \
        zaidan_C=1, zaidan_Ccontrast=1, zaidan_nu=1, cvTrain=False, Debug=False):
    '''
    Active-learning loop with optional per-round hyper-parameter tuning.

    Like the plain learn() variant, but when cvTrain is True the training
    matrix is rebuilt from scratch each round and the hyper-parameters
    (w_r/w_o, alpha, C, pooling weights, zaidan C/Ccontrast/nu) are re-tuned
    by cross-validation whenever np.mod(number_of_docs, 20) == 10.

    Returns (num_training_samples, model_scores); model_scores additionally
    records the hyper-parameter values used at each evaluation step.

    NOTE(review): with cvTrain=False no X_train is ever built, so model
    fitting below would fail -- this variant appears to require cvTrain=True;
    confirm against callers.
    '''
    start = time()
    print '-' * 50
    print 'Starting Active Learning...'
    _, num_feat = X_pool.shape
    model_scores = {'auc':[], 'accu':[], 'wr':[], 'wo':[], 'alpha':[], 'svm_C':[], 'zaidan_C':[], 'zaidan_Ccontrast':[], 'zaidan_nu':[], 'FMrvalue':[], 'IMweight':[], 'FMweight':[]}
    rationales = set()
    rationales_c0 = set()
    rationales_c1 = set()
    number_of_docs = 0
    feature_expert.rg.seed(seed)
    num_training_samples = []
    all_features = []
    # keep all the training data instance ids in docs list
    docs = training_set
    X_train = None
    y_train = []
    sample_weight = []
    # Collect the expert's rationale feature for every bootstrap document.
    for doc_id in docs:
        #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
        feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])
        number_of_docs = number_of_docs + 1
        # append feature to all_features, even if it is None
        all_features.append(feature)
        if feature is not None:
            rationales.add(feature)
            if y_pool[doc_id] == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)
    if cvTrain:
        # get optimal parameters depending on the model_type, then build the
        # (re-weighted) training matrix from the bootstrap documents
        if model_type == 'mnb_LwoR':
            w_r, w_o = optimalMNBLwoRParameters(X_pool[training_set], y_pool[training_set], all_features)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                x_feats = x[0].indices
                for f in x_feats:
                    # rationale feature weighted by w_r, all others by w_o
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        elif model_type == 'mnb':
            w_r, w_o = optimalMNBParameters(X_pool[training_set], y_pool[training_set], all_features)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        elif model_type == 'svm_linear':
            w_r, w_o, C = optimalSVMParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        elif model_type == 'svm_linear_LwoR':
            w_r, w_o, C = optimalSVMLwoRParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        if model_type == 'poolingMNB':
            # class priors estimated from the bootstrap labels
            classpriors = np.zeros(2)
            classpriors[1] = (np.sum(y_pool[docs])*1.)/(len(docs)*1.)
            classpriors[0] = 1. - classpriors[1]
            alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(X_pool[training_set], y_pool[training_set], all_features, smoothing, num_feat)
            feature_model = FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, smoothing, classpriors, poolingFM_r)
            feature_counter = 0
            for doc_id in docs:
                if all_features[feature_counter]:
                    # updates feature model with features one at a time
                    feature_model.fit(all_features[feature_counter], y_pool[doc_id])
                feature_counter = feature_counter + 1
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
                y_train.append(y_pool[doc_id])
        if model_type == 'Zaidan':
            zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                if all_features[feature_counter] is not None:
                    x_pseudo = (X_pool[doc_id]).todense()
                    # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            test = x[0,f]
                            x_pseudo[0,f] = x[0,f]/zaidan_nu
                        else:
                            x_pseudo[0,f] = 0.0
                    x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)
                if not y_train:
                    X_train = x
                    if all_features[feature_counter] is not None:
                        X_train = sp.vstack((X_train, x_pseudo))
                else:
                    X_train = sp.vstack((X_train, x))
                    if all_features[feature_counter] is not None:
                        X_train = sp.vstack((X_train, x_pseudo))
                y_train.append(y_pool[doc_id])
                if all_features[feature_counter] is not None:
                    # append y label again for the pseudoinstance created
                    y_train.append(y_pool[doc_id])
                sample_weight.append(zaidan_C)
                if all_features[feature_counter] is not None:
                    # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                    sample_weight.append(zaidan_Ccontrast)
                feature_counter = feature_counter + 1
    # Train the model
    if model_type == 'lrl2':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
    elif model_type == 'lrl1':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)
    elif model_type == 'mnb':
        model = MultinomialNB(alpha=alpha)
    elif model_type == 'mnb_LwoR':
        model = MultinomialNB(alpha=alpha)
    elif model_type == 'svm_linear_LwoR':
        # C comes from the cross-validated parameter search above
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type == 'svm_linear':
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type == 'poolingMNB':
        instance_model = MultinomialNB(alpha=alpha)
        model = PoolingMNB()
    elif model_type == 'Zaidan':
        # NOTE(review): the initial Zaidan model uses C=1.0 while the
        # in-loop retraining below uses C=svm_C -- confirm this is intended.
        random_state = np.random.RandomState(seed=seed)
        model = svm.SVC(kernel='linear', C=1.0, random_state=random_state)
    if model_type == 'poolingMNB':
        instance_model.fit(X_train, y_train)
        model.fit(instance_model, feature_model, weights=poolingMNBWeights)  # train pooling_model
    elif model_type == 'Zaidan':
        model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
    else:
        model.fit(X_train, np.array(y_train))
    (accu, auc) = evaluate_model(model, X_test, y_test)
    model_scores['auc'].append(auc)
    model_scores['accu'].append(accu)
    # record the hyper-parameters used at this step; models that do not use a
    # given parameter record 0.0 so the lists stay aligned
    if model_type == 'poolingMNB':
        model_scores['alpha'].append(alpha)
        model_scores['FMrvalue'].append(poolingFM_r)
        model_scores['IMweight'].append(poolingMNBWeights[0])
        model_scores['FMweight'].append(poolingMNBWeights[1])
    else:
        model_scores['FMrvalue'].append(0.0)
        model_scores['IMweight'].append(0.0)
        model_scores['FMweight'].append(0.0)
    if model_type == 'Zaidan':
        model_scores['zaidan_C'].append(zaidan_C)
        model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
        model_scores['zaidan_nu'].append(zaidan_nu)
    else:
        model_scores['zaidan_C'].append(0.0)
        model_scores['zaidan_Ccontrast'].append(0.0)
        model_scores['zaidan_nu'].append(0.0)
    # NOTE(review): for poolingMNB, 'alpha' is appended twice per step (once
    # above and 0.0 here), so that list can drift out of alignment -- verify.
    if model_type == 'mnb' or model_type == 'mnb_LwoR':
        model_scores['alpha'].append(alpha)
    else:
        model_scores['alpha'].append(0.0)
    if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR':
        model_scores['svm_C'].append(C)
    else:
        model_scores['svm_C'].append(0.0)
    if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR' or model_type == 'mnb' or model_type == 'mnb_LwoR':
        model_scores['wr'].append(w_r)
        model_scores['wo'].append(w_o)
    else:
        model_scores['wr'].append(0.0)
        model_scores['wo'].append(0.0)
    num_training_samples.append(number_of_docs)
    feature_expert.rg.seed(seed)
    # document-selection strategy
    if selection_strategy == 'random':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'unc':
        doc_pick_model = UNCSampling()
    elif selection_strategy == "pnc":
        doc_pick_model = UNCPreferNoConflict()
    elif selection_strategy == "pnr":
        doc_pick_model = UNCPreferNoRationale()
    elif selection_strategy == "pr":
        doc_pick_model = UNCPreferRationale()
    elif selection_strategy == "pc":
        doc_pick_model = UNCPreferConflict()
    elif selection_strategy == "tt":
        doc_pick_model = UNCThreeTypes()
    elif selection_strategy == "pipe":
        doc_pick_model = Pipe([UNCSampling(), UNCPreferConflict()], [10, 30])
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy)
    k = step_size
    #while X_train.shape[0] < budget:
    while number_of_docs < budget:
        # Choose a document based on the strategy chosen
        if selection_strategy == "pnc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pnr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "tt":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pipe":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)
        if doc_ids is None or len(doc_ids) == 0:
            break
        for doc_id in doc_ids:
            #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
            feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])
            all_features.append(feature)
            number_of_docs = number_of_docs + 1
            if feature is not None:
                rationales.add(feature)
                if y_pool[doc_id] == 0:
                    rationales_c0.add(feature)
                else:
                    rationales_c1.add(feature)
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(long(doc_id))
            if cvTrain:
                # get optimal parameters depending on the model_type; the
                # training matrix is rebuilt from the whole training_set and
                # parameters are re-tuned every 20 docs (mod 20 == 10).
                # (the inner loops re-use the name doc_id, shadowing the
                # outer loop variable)
                X_train = None
                y_train = []
                sample_weight = []
                if model_type == 'mnb_LwoR':
                    if np.mod(number_of_docs, 20) == 10:
                        w_r, w_o = optimalMNBLwoRParameters(X_pool[training_set], y_pool[training_set], all_features)
                    feature_counter = 0
                    for doc_id in training_set:
                        x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                        x_feats = x[0].indices
                        for f in x_feats:
                            if f == all_features[feature_counter]:
                                x[0,f] = w_r*x[0,f]
                            else:
                                x[0,f] = w_o*x[0,f]
                        feature_counter = feature_counter + 1
                        if not y_train:
                            X_train = x
                        else:
                            X_train = sp.vstack((X_train, x))
                        y_train.append(y_pool[doc_id])
                elif model_type == 'mnb':
                    if np.mod(number_of_docs, 20) == 10:
                        w_r, w_o = optimalMNBParameters(X_pool[training_set], y_pool[training_set], all_features)
                    feature_counter = 0
                    for doc_id in training_set:
                        x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                        x_feats = x[0].indices
                        for f in x_feats:
                            if f == all_features[feature_counter]:
                                x[0,f] = w_r*x[0,f]
                            else:
                                x[0,f] = w_o*x[0,f]
                        feature_counter = feature_counter + 1
                        if not y_train:
                            X_train = x
                        else:
                            X_train = sp.vstack((X_train, x))
                        y_train.append(y_pool[doc_id])
                elif model_type == 'svm_linear_LwoR':
                    if np.mod(number_of_docs, 20) == 10:
                        w_r, w_o, C = optimalSVMLwoRParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
                    feature_counter = 0
                    for doc_id in training_set:
                        x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                        x_feats = x[0].indices
                        for f in x_feats:
                            if f == all_features[feature_counter]:
                                x[0,f] = w_r*x[0,f]
                            else:
                                x[0,f] = w_o*x[0,f]
                        feature_counter = feature_counter + 1
                        if not y_train:
                            X_train = x
                        else:
                            X_train = sp.vstack((X_train, x))
                        y_train.append(y_pool[doc_id])
                elif model_type == 'svm_linear':
                    if np.mod(number_of_docs, 20) == 10:
                        w_r, w_o, C = optimalSVMParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
                    feature_counter = 0
                    for doc_id in training_set:
                        x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                        x_feats = x[0].indices
                        for f in x_feats:
                            if f == all_features[feature_counter]:
                                x[0,f] = w_r*x[0,f]
                            else:
                                x[0,f] = w_o*x[0,f]
                        feature_counter = feature_counter + 1
                        if not y_train:
                            X_train = x
                        else:
                            X_train = sp.vstack((X_train, x))
                        y_train.append(y_pool[doc_id])
                if model_type == 'poolingMNB':
                    classpriors = np.zeros(2)
                    classpriors[1] = (np.sum(y_pool[docs])*1.)/(len(docs)*1.)
                    classpriors[0] = 1. - classpriors[1]
                    if np.mod(number_of_docs, 20) == 10:
                        alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(X_pool[training_set], y_pool[training_set], all_features, smoothing, num_feat)
                    feature_model = FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, smoothing, classpriors, poolingFM_r)
                    feature_counter = 0
                    for doc_id in training_set:
                        if all_features[feature_counter]:
                            # updates feature model with features one at a time
                            feature_model.fit(all_features[feature_counter], y_pool[doc_id])
                        feature_counter = feature_counter + 1
                        x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                        if not y_train:
                            X_train = x
                        else:
                            X_train = sp.vstack((X_train, x))
                        y_train.append(y_pool[doc_id])
                if model_type == 'Zaidan':
                    if np.mod(number_of_docs, 20) == 10:
                        zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
                    feature_counter = 0
                    for doc_id in training_set:
                        x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                        if all_features[feature_counter] is not None:
                            x_pseudo = (X_pool[doc_id]).todense()
                            # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                            x_feats = x[0].indices
                            for f in x_feats:
                                if f == all_features[feature_counter]:
                                    test = x[0,f]
                                    x_pseudo[0,f] = x[0,f]/zaidan_nu
                                else:
                                    x_pseudo[0,f] = 0.0
                            x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)
                        if not y_train:
                            X_train = x
                            if all_features[feature_counter] is not None:
                                X_train = sp.vstack((X_train, x_pseudo))
                        else:
                            X_train = sp.vstack((X_train, x))
                            if all_features[feature_counter] is not None:
                                X_train = sp.vstack((X_train, x_pseudo))
                        y_train.append(y_pool[doc_id])
                        if all_features[feature_counter] is not None:
                            # append y label again for the pseudoinstance created
                            y_train.append(y_pool[doc_id])
                        sample_weight.append(zaidan_C)
                        if all_features[feature_counter] is not None:
                            # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                            sample_weight.append(zaidan_Ccontrast)
                        feature_counter = feature_counter + 1
            # Train the model
            if model_type == 'lrl2':
                random_state = np.random.RandomState(seed=seed)
                model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
            elif model_type == 'lrl1':
                random_state = np.random.RandomState(seed=seed)
                model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)
            elif model_type == 'mnb':
                model = MultinomialNB(alpha=alpha)
            elif model_type == 'mnb_LwoR':
                model = MultinomialNB(alpha=alpha)
            elif model_type == 'svm_linear_LwoR':
                random_state = np.random.RandomState(seed=seed)
                model = LinearSVC(C=C, random_state=random_state)
            elif model_type == 'svm_linear':
                random_state = np.random.RandomState(seed=seed)
                model = LinearSVC(C=C, random_state=random_state)
            elif model_type == 'poolingMNB':
                instance_model = MultinomialNB(alpha=alpha)
                model = PoolingMNB()
            elif model_type == 'Zaidan':
                random_state = np.random.RandomState(seed=seed)
                model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state)
            if model_type == 'poolingMNB':
                instance_model.fit(X_train, y_train)
                model.fit(instance_model, feature_model, weights=poolingMNBWeights)  # train pooling_model
            elif model_type == 'Zaidan':
                model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
            else:
                model.fit(X_train, np.array(y_train))
            (accu, auc) = evaluate_model(model, X_test, y_test)
            model_scores['auc'].append(auc)
            model_scores['accu'].append(accu)
            # per-step hyper-parameter bookkeeping (same scheme as above)
            if model_type == 'poolingMNB':
                model_scores['alpha'].append(alpha)
                model_scores['FMrvalue'].append(poolingFM_r)
                model_scores['IMweight'].append(poolingMNBWeights[0])
                model_scores['FMweight'].append(poolingMNBWeights[1])
            else:
                model_scores['FMrvalue'].append(0.0)
                model_scores['IMweight'].append(0.0)
                model_scores['FMweight'].append(0.0)
            if model_type == 'Zaidan':
                model_scores['zaidan_C'].append(zaidan_C)
                model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
                model_scores['zaidan_nu'].append(zaidan_nu)
            else:
                model_scores['zaidan_C'].append(0.0)
                model_scores['zaidan_Ccontrast'].append(0.0)
                model_scores['zaidan_nu'].append(0.0)
            if model_type == 'mnb' or model_type == 'mnb_LwoR':
                model_scores['alpha'].append(alpha)
            else:
                model_scores['alpha'].append(0.0)
            if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR':
                model_scores['svm_C'].append(C)
            else:
                model_scores['svm_C'].append(0.0)
            if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR' or model_type == 'mnb' or model_type == 'mnb_LwoR':
                model_scores['wr'].append(w_r)
                model_scores['wo'].append(w_o)
            else:
                model_scores['wr'].append(0.0)
                model_scores['wo'].append(0.0)
            num_training_samples.append(number_of_docs)
    print 'Active Learning took %2.2fs' % (time() - start)
    return (np.array(num_training_samples), model_scores)
# Script-level benchmark (Python 2): full-pool instance-model baselines vs.
# feature models built from the feature expert's rankings.
# NOTE(review): relies on X_pool/y_pool/X_test/y_test, args, evaluate_model,
# feature_expert and FeatureMNBUniform defined elsewhere in the file.
models = {'MultinomialNB(alpha=1)':MultinomialNB(alpha=1), \
          'LogisticRegression(C=1, penalty=\'l1\')':LogisticRegression(C=1, penalty='l1'), \
          'LogisticRegression(C=0.1, penalty=\'l1\')':LogisticRegression(C=0.1, penalty='l1')}
aucs = {}
for mk in models.keys():
    # Train each baseline on the full pool and record its test AUC.
    models[mk].fit(X_pool, y_pool)
    _, auc = evaluate_model(models[mk], X_test, y_test)
    aucs[mk] = auc
# Feature model using ALL ranked features per class...
fe = feature_expert(X_pool, y_pool, metric="L1", C=args.c)
all_feature_model = FeatureMNBUniform(fe.feature_rank[0], fe.feature_rank[1], fe.num_features, smoothing=args.smoothing)
all_feature_model.update()
_, all_auc = evaluate_model(all_feature_model, X_test, y_test)
# ...and using only the top-k ranked features per class.
k_feature_model = FeatureMNBUniform(fe.feature_rank[0][:args.k], fe.feature_rank[1][:args.k], fe.num_features, smoothing=args.smoothing)
k_feature_model.update()
_, k_auc = evaluate_model(k_feature_model, X_test, y_test)
print '-' * 50
def choice(self, X, y, pool, train_indices, current_feature_model, current_reasoning_model, w_n, w_a):
    """Return the pool document whose hypothetical labeling maximizes AUC.

    For up to self.sub_pool randomly drawn candidates, retrains
    instance/feature/reasoning/pooling models as though the candidate were
    labeled, then evaluates the model chosen by self.optimize ('P' pooling,
    'I' instance, 'F' feature, 'R' reasoning) on (self.X_test, self.y_test).
    """
    rand_indices = self.rgen.permutation(len(pool))
    candidates = [pool[i] for i in rand_indices[:self.sub_pool]]
    aucs = []
    for doc in candidates:
        # Simulate adding this candidate to the labeled set.
        new_train_indices = list(train_indices)
        new_train_indices.append(doc)
        # train an instance model
        instance_model = MultinomialNB(alpha=1.)
        instance_model.fit(X[new_train_indices], y[new_train_indices])
        # train a feature model (fresh copy; the current model is not mutated)
        feature_model = None
        if isinstance(current_feature_model, FeatureMNBUniform):
            feature_model = FeatureMNBUniform(current_feature_model.class0_feats, current_feature_model.class1_feats, self.feature_expert.num_features, 0)
        elif isinstance(current_feature_model, FeatureMNBWeighted):
            feature_model = FeatureMNBWeighted(num_feat=self.feature_expert.num_features, feat_count=current_feature_model.feature_count_, imaginary_counts=current_feature_model.imaginary_counts)
        else:
            raise ValueError('Feature model type: \'%s\' unknown!' % current_feature_model.__class__.__name__)
        top_feat = self.feature_expert.most_informative_feature(X[doc], y[doc])
        if top_feat:
            feature_model.fit(top_feat, y[doc])  # fit also calls update; so there is no need to update again
        else:
            feature_model.update()
        # make a deep copy of the reasoning model and partial train it
        reasoning_model = copy.deepcopy(current_reasoning_model)
        reasoning_model.partial_fit(X[doc], y[doc], top_feat, w_n, w_a)
        # pooling model: equal-weight combination of instance and feature models
        pooling_model = PoolingMNB()
        pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])
        # evaluate the model selected by self.optimize
        opt_model = None
        if self.optimize == "P":
            opt_model = pooling_model
        elif self.optimize == "I":
            opt_model = instance_model
        elif self.optimize == "F":
            opt_model = feature_model
        elif self.optimize == "R":
            opt_model = reasoning_model
        else:
            raise ValueError('Optimization Model: \'%s\' invalid!' % self.optimize)
        y_probas = opt_model.predict_proba(self.X_test)
        auc = metrics.roc_auc_score(self.y_test, y_probas[:, 1])
        aucs.append(auc)
    # candidate with the highest simulated AUC wins
    doc_id = candidates[np.argsort(aucs)[-1]]
    return doc_id
def optimalPoolingMNBParameters(X_pool, y_pool, all_features, smoothing, num_feat):
    '''
    Grid-search the pooling-MNB hyper-parameters by 5-fold cross-validation.

    For every (alpha, rValue, IMweight) combination, each fold trains a
    MultinomialNB instance model and a FeatureMNBUniform feature model on the
    training folds, pools them with weights (IMweight, 1 - IMweight), and
    collects predicted probabilities for the held-out fold.  The combination
    maximizing the pooled out-of-fold AUC wins.

    Parameters:
        X_pool, y_pool -- pool documents and labels (indexable by doc id)
        all_features   -- per-document rationale feature (or None/falsy),
                          aligned with y_pool
        smoothing      -- smoothing passed to FeatureMNBUniform
        num_feat       -- total number of features

    Returns (optimal_alpha, optimal_rValue, optimal_model_weights) where
    optimal_model_weights is [instance-model weight, feature-model weight].
    '''
    grid_alpha = [0.01, 0.1, 1.0, 10.0, 100.0]   # instance-model smoothing
    grid_rValue = [100.0, 1000.0]                # feature-model r value
    grid_IMweight = [0.99, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.01]  # instance-model weight
    max_auc = -1
    optimal_alpha = -1
    optimal_rValue = -1
    optimal_model_weights = np.zeros(2)
    optimal_model_weights[0] = -1
    optimal_model_weights[1] = -1
    for alpha in grid_alpha:
        for rValue in grid_rValue:
            for IMweight in grid_IMweight:
                # out-of-fold predicted probabilities for every pool document
                all_probabilities = np.ndarray(shape=(len(y_pool), 2))
                kf = KFold(len(y_pool), n_folds=5)
                for train, test in kf:
                    X_train = None
                    y_train = []
                    rationales_c0 = set()
                    rationales_c1 = set()
                    feature_model = FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, smoothing, [0.5, 0.5], rValue)
                    for doc_id in train:
                        if all_features[doc_id]:
                            # feed rationale features one document at a time
                            feature_model.fit(all_features[doc_id], y_pool[doc_id])
                        x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                        if not y_train:
                            X_train = x
                        else:
                            X_train = sp.vstack((X_train, x))
                        y_train.append(y_pool[doc_id])
                    instance_model = MultinomialNB(alpha=alpha)
                    instance_model.fit(X_train, y_train)
                    model = PoolingMNB()
                    poolingMNBWeights = np.array([IMweight, 1. - IMweight])
                    model.fit(instance_model, feature_model, weights=poolingMNBWeights)  # train pooling_model
                    X_test = X_pool[test]
                    y_probas = model.predict_proba(X_test)
                    for counter, t in enumerate(test):
                        all_probabilities[t] = y_probas[counter]
                # compute AUC based on all instances in the training data
                auc = metrics.roc_auc_score(y_pool, all_probabilities[:, 1])
                if auc > max_auc:
                    max_auc = auc
                    optimal_alpha = alpha
                    optimal_rValue = rValue
                    optimal_model_weights[0] = IMweight
                    optimal_model_weights[1] = 1. - IMweight
    return optimal_alpha, optimal_rValue, optimal_model_weights