Example No. 1
def run_trials(model_type, num_trials, dataset, tfidf, selection_strategy, metric, C, alpha, poolingMNBWeights, Meville_etal_r,\
                Zaidan_etal_C, Zaidan_etal_Ccontrast, Zaidan_etal_mu, bootstrap_size, balance, budget, step_size, topk, w_o, w_r, seed=0, lr_C=1, svm_C=1, Debug=False):
    
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)

        
    if not feat_names:
        feat_names = np.arange(X_pool.shape[1])
    
    feat_freq = np.diff(X_pool.tocsc().indptr)   
    
    fe = feature_expert(X_pool, y_pool, metric, C=C, pick_only_top=True)
    
    tfidft = TfidfTransformer()
    
    if tfidf:
        print "Performing tf-idf transformation"
        X_pool = tfidft.fit_transform(X_pool)
        X_test = tfidft.transform(X_test)        
    
    result = np.ndarray(num_trials, dtype=object)
    
    for i in range(num_trials):
        print('-' * 50)
        print('Starting Trial %d of %d...' % (i + 1, num_trials))

        trial_seed = seed + i # initialize the seed for the trial
        
        training_set, pool_set = RandomBootstrap(X_pool, y_pool, bootstrap_size, balance, trial_seed)
                
        result[i] = learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, fe, \
                          selection_strategy, budget, step_size, topk, w_o, w_r, trial_seed, alpha, poolingMNBWeights, Meville_etal_r, lr_C, svm_C, Zaidan_etal_C, Zaidan_etal_Ccontrast, Zaidan_etal_mu, Debug)
    
    return result, feat_names, feat_freq
Example No. 2
def run_trials(model_type, num_trials, dataset, tfidf, selection_strategy, metric, C, alpha, poolingMNBWeights, Meville_etal_r,\
                Zaidan_etal_C, Zaidan_etal_Ccontrast, Zaidan_etal_mu, bootstrap_size, balance, budget, step_size, topk, w_o, w_r, seed=0, lr_C=1, svm_C=1, Debug=False):

    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)

    if not feat_names:
        feat_names = np.arange(X_pool.shape[1])

    feat_freq = np.diff(X_pool.tocsc().indptr)

    fe = feature_expert(X_pool, y_pool, metric, C=C, pick_only_top=True)

    tfidft = TfidfTransformer()

    if tfidf:
        print("Performing tf-idf transformation")
        X_pool = tfidft.fit_transform(X_pool)
        X_test = tfidft.transform(X_test)

    result = np.ndarray(num_trials, dtype=object)

    for i in range(num_trials):
        print('-' * 50)
        print('Starting Trial %d of %d...' % (i + 1, num_trials))

        trial_seed = seed + i  # initialize the seed for the trial

        training_set, pool_set = RandomBootstrap(X_pool, y_pool,
                                                 bootstrap_size, balance,
                                                 trial_seed)

        result[i] = learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, fe, \
                          selection_strategy, budget, step_size, topk, w_o, w_r, trial_seed, alpha, poolingMNBWeights, Meville_etal_r, lr_C, svm_C, Zaidan_etal_C, Zaidan_etal_Ccontrast, Zaidan_etal_mu, Debug)

    return result, feat_names, feat_freq
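
A minimal sketch of how this run_trials variant might be invoked. Every argument value below is an illustrative assumption, not a value taken from the original project:

# Hypothetical invocation; all values are assumptions for illustration.
result, feat_names, feat_freq = run_trials(
    model_type='lrl2',              # assumed model identifier
    num_trials=5,
    dataset='imdb',                 # passed straight to load_dataset
    tfidf=True,
    selection_strategy='random',
    metric='L1',
    C=0.1,
    alpha=1,
    poolingMNBWeights=[0.5, 0.5],
    Meville_etal_r=100.0,
    Zaidan_etal_C=1.0,
    Zaidan_etal_Ccontrast=1.0,
    Zaidan_etal_mu=1.0,
    bootstrap_size=10,
    balance=True,
    budget=500,
    step_size=1,
    topk=10,
    w_o=1,
    w_r=1,
    seed=0,
)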
Example No. 3
def extract_features(texts, classes):
    vect = CountVectorizer(analyzer=u'word',
                           binary=True,
                           decode_error=u'ignore',
                           lowercase=True,
                           max_df=1.0,
                           max_features=None,
                           min_df=5,
                           ngram_range=(1, 1),
                           preprocessor=None,
                           stop_words=None,
                           strip_accents=None,
                           token_pattern=u'(?u)\\b\\w\\w+\\b',
                           tokenizer=None,
                           vocabulary=None)

    X = vect.fit_transform(texts)
    idx = np.where(X.sum(axis=1) > 0)[0]
    X = X[idx]
    y = np.array(classes)[idx]

    fe = feature_expert(X, y, metric='chi2', pick_only_top=True)

    i2w = {vect.vocabulary_[w]: w for w in vect.vocabulary_}
    pos_words = [i2w[i] for i in fe.feature_rank[0]]
    neg_words = [i2w[i] for i in fe.feature_rank[1]]

    return pos_words, neg_words
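
extract_features can be exercised on a toy corpus. Note that min_df=5 in the vectorizer drops any word occurring in fewer than five documents, so the repetition below is deliberate; the corpus itself is made up for illustration:

# Made-up corpus; each content word appears in at least five documents so it
# survives the min_df=5 cutoff inside extract_features.
texts = (['great movie , loved the acting'] * 6 +
         ['terrible movie , awful acting'] * 6)
classes = [1] * 6 + [0] * 6

pos_words, neg_words = extract_features(texts, classes)
print('words ranked for feature_rank[0]:', pos_words)
print('words ranked for feature_rank[1]:', neg_words)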
Example No. 4
def run_trials(num_trials, dataset, selection_strategy, metric, C, alpha, smoothing, \
                bootstrap_size, balance, coverage, disagree_strat, budget, fmtype, rmw_n, rmw_a, seed=0, Debug=False, \
                reasoning_strategy='random', switch=40):

    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)

    if not feat_names:
        feat_names = np.arange(X_pool.shape[1])

    feat_freq = np.diff(X_pool.tocsc().indptr)

    fe = feature_expert(X_pool,
                        y_pool,
                        metric,
                        smoothing=1e-6,
                        C=C,
                        pick_only_top=True)
    result = np.ndarray(num_trials, dtype=object)

    for i in range(num_trials):
        print('-' * 50)
        print('Starting Trial %d of %d...' % (i + 1, num_trials))

        trial_seed = seed + i  # initialize the seed for the trial

        instance_model = MultinomialNB(alpha=alpha)

        feature_model = None
        if fmtype == "fm_uniform":
            feature_model = FeatureMNBUniform([], [], fe.num_features,
                                              smoothing)
        elif fmtype == "fm_weighted":
            feature_model = FeatureMNBWeighted(num_feat=fe.num_features,
                                               imaginary_counts=1.)
        else:
            raise ValueError('Feature model type: \'%s\' invalid!' % fmtype)

        pooling_model = PoolingMNB()

        reasoning_model = ReasoningMNB(alpha=1)

        if bootstrap_size == 0:
            training_set = []
            pool_set = list(range(X_pool.shape[0]))
        else:
            training_set, pool_set = RandomBootstrap(X_pool, y_pool,
                                                     bootstrap_size, balance,
                                                     trial_seed)

        result[i] = learn(X_pool, y_pool, X_test, y_test, training_set, pool_set, \
            fe, selection_strategy, disagree_strat, coverage, budget, instance_model, \
            feature_model, pooling_model, reasoning_model, rmw_n, rmw_a, trial_seed, Debug, \
            reasoning_strategy, switch)

    return result, feat_names, feat_freq
Example No. 5
def covering(dataset, type='any', metric='L1', C=0.1, smoothing=1e-6):
    X_pool, y_pool, X_test, y_test, feat_names = load_dataset(dataset)
    num_samples, num_feat = X_pool.shape

    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
    feature_count = np.zeros(num_feat)

    # no_feature_docs[0] counts # of documents labeled 0 but without any features
    # no_feature_docs[1] counts # of documents labeled 1 but without any features
    no_feature_docs = np.zeros(2)

    for doc in range(num_samples):
        label = y_pool[doc]

        if type == 'agnostic':
            top_class0_feature = fe.top_n_class0_features(X_pool[doc], 1)
            top_class1_feature = fe.top_n_class1_features(X_pool[doc], 1)

            if len(top_class0_feature) == 0 and len(top_class1_feature) == 0:
                no_feature_docs[label] += 1
            elif len(top_class0_feature) == 0 and len(top_class1_feature) != 0:
                # if there is no class 0 feature, the top feature is class 1's top feature
                top_feature = top_class1_feature[0]
                feature_count[top_feature] += 1
            elif len(top_class0_feature) != 0 and len(top_class1_feature) == 0:
                # if there is no class 1 feature, the top feature is class 0's top feature
                top_feature = top_class0_feature[0]
                feature_count[top_feature] += 1
            else:
                # if both classes have a valid top feature, then compare the absolute value of the weights
                # of both features to determine the top feature for this document
                class0_feature_weight = fe.L1_weights[top_class0_feature[0]]
                class1_feature_weight = fe.L1_weights[top_class1_feature[0]]

                if np.absolute(class0_feature_weight) >= np.absolute(
                        class1_feature_weight):
                    top_feature = top_class0_feature[0]
                else:
                    top_feature = top_class1_feature[0]

                feature_count[top_feature] += 1

        elif type == 'sensitive':
            feature = fe.most_informative_feature(X_pool[doc], label)
            if feature is None:
                no_feature_docs[label] += 1
            else:
                feature_count[feature] += 1

    print('number of features needed to cover the entire corpus = %d' %
          len(np.nonzero(feature_count)[0]))
    print('number of uncovered class 0 documents: %d' % no_feature_docs[0])
    print('number of uncovered class 1 documents: %d' % no_feature_docs[1])
    with open('feature_count.pickle', 'wb') as f:
        pickle.dump(feature_count, f)
    with open('uncovered_count.pickle', 'wb') as f:
        pickle.dump(no_feature_docs, f)
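
Note that the loop above only handles type='agnostic' and type='sensitive'; with the default type='any', nothing is counted. A hypothetical call (the 'imdb' dataset name is an assumption) followed by reading back the pickled counts:

import pickle

covering('imdb', type='sensitive', metric='L1', C=0.1)

with open('feature_count.pickle', 'rb') as f:
    feature_count = pickle.load(f)
with open('uncovered_count.pickle', 'rb') as f:
    no_feature_docs = pickle.load(f)
print('distinct covering features: %d' % len(feature_count.nonzero()[0]))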
Example No. 6
def covering(dataset, type='any', metric='L1', C=0.1, smoothing=1e-6):
    X_pool, y_pool, X_test, y_test, feat_names = load_dataset(dataset)
    num_samples, num_feat = X_pool.shape
    
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
    feature_count = np.zeros(num_feat)
    
    # no_feature_docs[0] counts # of documents labeled 0 but without any features
    # no_feature_docs[1] counts # of documents labeled 1 but without any features
    no_feature_docs = np.zeros(2)
    
    for doc in range(num_samples):
        label = y_pool[doc]
        
        if type == 'agnostic':
            top_class0_feature = fe.top_n_class0_features(X_pool[doc], 1)
            top_class1_feature = fe.top_n_class1_features(X_pool[doc], 1)
            
            if len(top_class0_feature) == 0 and len(top_class1_feature) == 0:
                no_feature_docs[label] += 1
            elif len(top_class0_feature) == 0 and len(top_class1_feature) != 0:
                # if there is no class 0 feature, the top feature is class 1's top feature
                top_feature = top_class1_feature[0]
                feature_count[top_feature] += 1
            elif len(top_class0_feature) != 0 and len(top_class1_feature) == 0:
                # if there is no class 1 feature, the top feature is class 0's top feature
                top_feature = top_class0_feature[0]
                feature_count[top_feature] += 1
            else:
                # if both classes have a valid top feature, then compare the absolute value of the weights
                # of both features to determine the top feature for this document
                class0_feature_weight = fe.L1_weights[top_class0_feature[0]]
                class1_feature_weight = fe.L1_weights[top_class1_feature[0]]
                
                if np.absolute(class0_feature_weight) >= np.absolute(class1_feature_weight):
                    top_feature = top_class0_feature[0]
                else:
                    top_feature = top_class1_feature[0]
                
                feature_count[top_feature] += 1
            
        elif type == 'sensitive':
            feature = fe.most_informative_feature(X_pool[doc], label)
            if feature is None:
                no_feature_docs[label] += 1
            else:
                feature_count[feature] += 1
            
    print('number of features needed to cover the entire corpus = %d' % len(np.nonzero(feature_count)[0]))
    print('number of uncovered class 0 documents: %d' % no_feature_docs[0])
    print('number of uncovered class 1 documents: %d' % no_feature_docs[1])
    with open('feature_count.pickle', 'wb') as f:
        pickle.dump(feature_count, f)
    with open('uncovered_count.pickle', 'wb') as f:
        pickle.dump(no_feature_docs, f)
Example No. 7
def run_trials(model_type, num_trials, dataset, selection_strategy, metric, C, alpha, \
                bootstrap_size, balance, budget, step_size, topk, w_o, w_r, seed=0, lr_C=1, svm_C=1, svm_gamma=0, Debug=False):
    
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)

        
    if not feat_names:
        feat_names = np.arange(X_pool.shape[1])
    
    feat_freq = np.diff(X_pool.tocsc().indptr)   
    
    fe = feature_expert(X_pool, y_pool, metric, smoothing=1e-6, C=C, pick_only_top=True)
    result = np.ndarray(num_trials, dtype=object)
    
    for i in range(num_trials):
        print('-' * 50)
        print('Starting Trial %d of %d...' % (i + 1, num_trials))

        trial_seed = seed + i # initialize the seed for the trial
        
        training_set, pool_set = RandomBootstrap(X_pool, y_pool, bootstrap_size, balance, trial_seed)

        # Optional grid search to find good SVM parameters (disabled by the `if 0:` guard)
        if 0:
            # Train classifier
            #
            # For an initial search, a logarithmic grid with basis
            # 10 is often helpful. Using a basis of 2, a finer
            # tuning can be achieved but at a much higher cost.

            C_range = 10.0 ** np.arange(-5, 9)
            gamma_range = 10.0 ** np.arange(-5, 5)
            param_grid = dict(gamma=gamma_range, C=C_range)
            cv = StratifiedKFold(y=y_pool, n_folds=5)
            grid = GridSearchCV(SVC(kernel='poly'), param_grid=param_grid, cv=cv)
            grid.fit(X_pool, np.array(y_pool))
            print("The best classifier is: ", grid.best_estimator_)

            # Now we need to fit a classifier for all parameters in the 2d version
            # (we use a smaller set of parameters here because it takes a while to train)
            C_2d_range = [1, 1e2, 1e4]
            gamma_2d_range = [1e-1, 1, 1e1]
            classifiers = []
            for C in C_2d_range:
                for gamma in gamma_2d_range:
                    clf = SVC(C=C, gamma=gamma)
                    clf.fit(X_pool, np.array(y_pool))
                    classifiers.append((C, gamma, clf))
                
        result[i] = learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, fe, \
                          selection_strategy, budget, step_size, topk, w_o, w_r, trial_seed, lr_C, svm_C, svm_gamma, Debug)
    
    return result, feat_names, feat_freq
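
The disabled block above targets an older scikit-learn API (StratifiedKFold(y=..., n_folds=...)). A sketch of the equivalent search with the current API, should anyone re-enable it:

import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

C_range = 10.0 ** np.arange(-5, 9)
gamma_range = 10.0 ** np.arange(-5, 5)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedKFold(n_splits=5)  # y is now passed to fit, not the constructor
grid = GridSearchCV(SVC(kernel='poly'), param_grid=param_grid, cv=cv)
# grid.fit(X_pool, np.array(y_pool))
# print("The best classifier is: ", grid.best_estimator_)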
Example No. 8
def run_trials(num_trials, dataset, selection_strategy, metric, C, alpha, smoothing, \
                bootstrap_size, balance, coverage, disagree_strat, budget, fmtype, rmw_n, rmw_a, seed=0, Debug=False, \
                reasoning_strategy='random', switch=40):
    
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)
    
    if not feat_names:
        feat_names = np.arange(X_pool.shape[1])
    
    feat_freq = np.diff(X_pool.tocsc().indptr)   
    
    fe = feature_expert(X_pool, y_pool, metric, smoothing=1e-6, C=C, pick_only_top=True)
    result = np.ndarray(num_trials, dtype=object)
    
    for i in range(num_trials):
        print('-' * 50)
        print('Starting Trial %d of %d...' % (i + 1, num_trials))

        trial_seed = seed + i # initialize the seed for the trial
        
        instance_model = MultinomialNB(alpha=alpha)
        
        feature_model = None 
        if fmtype == "fm_uniform":
            feature_model = FeatureMNBUniform([], [], fe.num_features, smoothing)
        elif fmtype == "fm_weighted":
            feature_model = FeatureMNBWeighted(num_feat=fe.num_features, imaginary_counts=1.)
        else:
            raise ValueError('Feature model type: \'%s\' invalid!' % fmtype)
            
        pooling_model = PoolingMNB()
        
        reasoning_model = ReasoningMNB(alpha=1)

        if bootstrap_size == 0:
            training_set = []
            pool_set = list(range(X_pool.shape[0]))
        else:
            training_set, pool_set = RandomBootstrap(X_pool, y_pool, bootstrap_size, balance, trial_seed)
        
        result[i] = learn(X_pool, y_pool, X_test, y_test, training_set, pool_set, \
            fe, selection_strategy, disagree_strat, coverage, budget, instance_model, \
            feature_model, pooling_model, reasoning_model, rmw_n, rmw_a, trial_seed, Debug, \
            reasoning_strategy, switch)
    
    return result, feat_names, feat_freq
Example No. 9
def full_knowledge(dataset, metric='mutual_info', C=0.1, alpha=1, smoothing=0):
    '''
    This function uses the entire pool to learn the instance model,
    feature model, and pooling model which provides an upper bound on how well
    these models can perform on this particular dataset
    '''
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)
    fe = feature_expert(X_pool, y_pool, metric, smoothing=1e-6, C=C)
    
    instance_model_scores = {'auc':[], 'accu':[]}
    feature_model_scores = {'auc':[], 'accu':[]}
    pooling_model_scores = {'auc':[], 'accu':[]}
    
    instance_model = MultinomialNB(alpha=alpha)
    feature_model = FeatureMNBUniform([], [], fe.num_features, smoothing)
    pooling_model = PoolingMNB()

    instance_model.fit(X_pool, y_pool)
    for doc in range(X_pool.shape[0]):
        feature = fe.most_informative_feature(X_pool[doc], y_pool[doc])
        feature_model.fit(feature, y_pool[doc])

    # Evaluate performance based on Instance Model
    (accu, auc) = evaluate_model(instance_model, X_test, y_test)
    instance_model_scores['auc'].append(auc)
    instance_model_scores['accu'].append(accu)
    print('Instance Model: auc = %f, accu = %f' % (auc, accu))
    
    # Evaluate performance on Feature Model
    (accu, auc) = evaluate_model(feature_model, X_test, y_test)
    feature_model_scores['auc'].append(auc)
    feature_model_scores['accu'].append(accu)
    print('Feature Model: auc = %f, accu = %f' % (auc, accu))
    
    # Evaluate performance on Pooled Model
    pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])
    (accu, auc) = evaluate_model(pooling_model, X_test, y_test)
    pooling_model_scores['auc'].append(auc)
    pooling_model_scores['accu'].append(accu)
    print('Pooled Model: auc = %f, accu = %f' % (auc, accu))
Example No. 10
def full_knowledge(dataset, metric='mutual_info', C=0.1, alpha=1, smoothing=0):
    '''
    This function uses the entire pool to learn the instance model,
    feature model, and pooling model which provides an upper bound on how well
    these models can perform on this particular dataset
    '''
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)
    fe = feature_expert(X_pool, y_pool, metric, smoothing=1e-6, C=C)

    instance_model_scores = {'auc': [], 'accu': []}
    feature_model_scores = {'auc': [], 'accu': []}
    pooling_model_scores = {'auc': [], 'accu': []}

    instance_model = MultinomialNB(alpha=alpha)
    feature_model = FeatureMNBUniform([], [], fe.num_features, smoothing)
    pooling_model = PoolingMNB()

    instance_model.fit(X_pool, y_pool)
    for doc in range(X_pool.shape[0]):
        feature = fe.most_informative_feature(X_pool[doc], y_pool[doc])
        feature_model.fit(feature, y_pool[doc])

    # Evaluate performance based on Instance Model
    (accu, auc) = evaluate_model(instance_model, X_test, y_test)
    instance_model_scores['auc'].append(auc)
    instance_model_scores['accu'].append(accu)
    print('Instance Model: auc = %f, accu = %f' % (auc, accu))

    # Evaluate performance on Feature Model
    (accu, auc) = evaluate_model(feature_model, X_test, y_test)
    feature_model_scores['auc'].append(auc)
    feature_model_scores['accu'].append(accu)
    print('Feature Model: auc = %f, accu = %f' % (auc, accu))

    # Evaluate performance on Pooled Model
    pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])
    (accu, auc) = evaluate_model(pooling_model, X_test, y_test)
    pooling_model_scores['auc'].append(auc)
    pooling_model_scores['accu'].append(accu)
    print('Pooled Model: auc = %f, accu = %f' % (auc, accu))
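
evaluate_model itself is not shown in these examples; from its usage it returns an (accuracy, AUC) pair. A minimal sketch consistent with that contract, assuming the model exposes predict and predict_proba, might be:

from sklearn.metrics import accuracy_score, roc_auc_score

def evaluate_model(model, X_test, y_test):
    # Sketch only: the project's real evaluate_model may differ in detail.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]  # probability of class 1
    accu = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    return accu, auc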
Example No. 11
def run_trials(num_trials, dataset, selection_strategy, metric, C, alpha, \
                bootstrap_size, balance, budget, rmw_n, rmw_a, seed=0, Debug=False):

    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)

    if not feat_names:
        feat_names = np.arange(X_pool.shape[1])

    feat_freq = np.diff(X_pool.tocsc().indptr)

    fe = feature_expert(X_pool,
                        y_pool,
                        metric,
                        smoothing=1e-6,
                        C=C,
                        pick_only_top=True)
    result = np.ndarray(num_trials, dtype=object)

    for i in range(num_trials):
        print('-' * 50)
        print('Starting Trial %d of %d...' % (i + 1, num_trials))

        trial_seed = seed + i  # initialize the seed for the trial

        instance_model = MultinomialNB(alpha=alpha)

        reasoning_model = ReasoningMNB(alpha=1)

        training_set, pool_set = RandomBootstrap(X_pool, y_pool,
                                                 bootstrap_size, balance,
                                                 trial_seed)

        result[i] = learn(X_pool, y_pool, X_test, y_test, training_set, pool_set, fe, \
                          selection_strategy, budget, instance_model, reasoning_model, rmw_n, rmw_a, trial_seed, Debug)

    return result, feat_names, feat_freq
Example No. 12
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(args.dataset)

    num_inst, num_feat = X_pool.shape

    if not feat_names:
        feat_names = np.arange(num_feat)

    feature_frequency = np.diff(X_pool.tocsc().indptr)

    c0_frequency = np.diff(X_pool[np.nonzero(y_pool == 0)[0]].tocsc().indptr)
    c1_frequency = np.diff(X_pool[np.nonzero(y_pool == 1)[0]].tocsc().indptr)

    fe = feature_expert(X_pool,
                        y_pool,
                        metric=args.metric,
                        smoothing=1e-6,
                        C=args.c,
                        pick_only_top=False)

    print('-' * 50)

    print("FEATURE ANALYSIS")

    print('-' * 50)

    print("Number of all features: %d" % (X_pool.shape[1]))
    print("Number of non-zero features: %d" % (len(fe.feature_rank[0]) +
                                               len(fe.feature_rank[1])))
    print("# of class 0 features: %d" % (len(fe.feature_rank[0])))
    print("# of class 1 features: %d" % (len(fe.feature_rank[1])))
Example No. 13
def run_trials(model_type, num_trials, dataset, selection_strategy, metric, C, alpha, \
                bootstrap_size, balance, budget, step_size, topk, w_o, w_r, seed=0, lr_C=1, svm_C=1, svm_gamma=0, zaidan_C=0.01, zaidan_Ccontrast=1.0, zaidan_nu=1.0, Debug=False):

    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)

    if not feat_names:
        feat_names = np.arange(X_pool.shape[1])

    feat_freq = np.diff(X_pool.tocsc().indptr)

    fe = feature_expert(X_pool,
                        y_pool,
                        metric,
                        smoothing=1e-6,
                        C=C,
                        pick_only_top=True)
    result = np.ndarray(num_trials, dtype=object)

    for i in range(num_trials):
        print('-' * 50)
        print('Starting Trial %d of %d...' % (i + 1, num_trials))

        trial_seed = seed + i  # initialize the seed for the trial

        training_set, pool_set = RandomBootstrap(X_pool, y_pool,
                                                 bootstrap_size, balance,
                                                 trial_seed)

        # Optional grid search to find good SVM parameters (disabled by the `if 0:` guard)
        if 0:
            # Train classifier
            #
            # For an initial search, a logarithmic grid with basis
            # 10 is often helpful. Using a basis of 2, a finer
            # tuning can be achieved but at a much higher cost.

            C_range = 10.0**np.arange(-5, 9)
            gamma_range = 10.0**np.arange(-5, 5)
            param_grid = dict(gamma=gamma_range, C=C_range)
            cv = StratifiedKFold(y=y_pool, n_folds=5)
            grid = GridSearchCV(SVC(kernel='poly'),
                                param_grid=param_grid,
                                cv=cv)
            grid.fit(X_pool, np.array(y_pool))
            print("The best classifier is: ", grid.best_estimator_)

            # Now we need to fit a classifier for all parameters in the 2d version
            # (we use a smaller set of parameters here because it takes a while to train)
            C_2d_range = [1, 1e2, 1e4]
            gamma_2d_range = [1e-1, 1, 1e1]
            classifiers = []
            for C in C_2d_range:
                for gamma in gamma_2d_range:
                    clf = SVC(C=C, gamma=gamma)
                    clf.fit(X_pool, np.array(y_pool))
                    classifiers.append((C, gamma, clf))

        result[i] = learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, fe, \
                          selection_strategy, budget, step_size, topk, w_o, w_r, trial_seed, lr_C, svm_C, svm_gamma, zaidan_C, zaidan_Ccontrast, zaidan_nu, Debug)

    return result, feat_names, feat_freq
Example No. 14
def covering(dataset='imdb',
             first='positive',
             agreement='any',
             metric='mutual_info',
             smoothing=1e-6,
             C=1):
    if first == 'positive':
        offset = 1
    else:
        offset = 0
    class_label = {0: 'negative', 1: 'positive'}
    vect = CountVectorizer(min_df=5,
                           max_df=1.0,
                           binary=True,
                           ngram_range=(1, 1))

    if dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(
            "./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(
            dataset,
            tuple) and len(dataset) == 3 and dataset[0] == 'newsgroups':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], shuffle=False, random_state=42, \
            vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)

    print('class 0 features (ranked):')
    print(', '.join(
        [str((f, feature_names[f])) for f in fe.class0_features_by_rank()]))

    print('class 1 features (ranked):')
    print(', '.join(
        [str((f, feature_names[f])) for f in fe.class1_features_by_rank()]))

    sample_pool = list(range(X_pool.shape[0]))
    feature_list = list()
    X_csc = X_pool.tocsc()

    feature_num = 0

    while len(sample_pool) != 0:
        label = (feature_num + offset) % 2  # label for the document
        rank = feature_num // 2  # rank of the feature in the list
        feature_num += 1

        if rank < len(fe.feature_rank[label]):
            feature = fe.feature_rank[label][rank]
        else:
            print('*' * 50)
            print(', '.join(['#' + str(doc)
                             for doc in sample_pool]) + ' are uncovered')
            for doc in sample_pool:
                print('-' * 50)
                print('Document #%d:' % doc)
                print('=' * 50)
                print('length = %d' % len(X_pool_docs[doc]))
                print(X_pool_docs[doc])
                print('=' * 50)
                print(X_pool[doc].indices)
            break

        feature_name = feature_names[feature]
        docs_with_feature = X_csc.getcol(feature).indices

        docs_in_pool_with_feature = list(
            set(sample_pool).intersection(set(docs_with_feature)))
        if len(docs_in_pool_with_feature) == 0:
            continue
        else:
            num_docs_covered = len(docs_in_pool_with_feature)
            num_positive_docs = len(
                np.nonzero(y_pool[docs_in_pool_with_feature] == 1)[0])
            num_negative_docs = len(
                np.nonzero(y_pool[docs_in_pool_with_feature] == 0)[0])

            poolsize_before_removal = len(sample_pool)

            if agreement == 'agree':
                docs_with_label = np.nonzero(y_pool == label)[0]
                docs_to_remove = list(
                    set(docs_in_pool_with_feature).intersection(
                        set(docs_with_label)))
                sample_pool = list(
                    set(sample_pool).difference(set(docs_to_remove)))
            else:
                sample_pool = list(
                    set(sample_pool).difference(
                        set(docs_in_pool_with_feature)))

            # pack the information into a dictionary for easy printing
            result = dict()
            result['name'] = feature_name
            result['num'] = feature
            result['class'] = class_label[label]
            result['poolsize_before_removal'] = poolsize_before_removal
            result['num_docs_covered'] = num_docs_covered
            result['num_positive_docs'] = num_positive_docs
            result['num_negative_docs'] = num_negative_docs
            result['poolsize_after_removal'] = len(sample_pool)

            feature_list.append(result)

    return feature_list
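
Each dictionary in the returned feature_list records how much of the pool a feature covered. An illustrative loop over a hypothetical call (the argument values are assumptions):

feature_list = covering(dataset='imdb', first='positive', agreement='any')
for r in feature_list:
    print('%s (#%d, %s): covered %d docs (%d pos / %d neg), pool %d -> %d' %
          (r['name'], r['num'], r['class'], r['num_docs_covered'],
           r['num_positive_docs'], r['num_negative_docs'],
           r['poolsize_before_removal'], r['poolsize_after_removal']))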
Example No. 15
    args = parser.parse_args()

    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(args.dataset)

    models = {'MultinomialNB(alpha=1)':MultinomialNB(alpha=1), \
              'LogisticRegression(C=1, penalty=\'l1\')':LogisticRegression(C=1, penalty='l1'), \
              'LogisticRegression(C=0.1, penalty=\'l1\')':LogisticRegression(C=0.1, penalty='l1')}

    aucs = {}

    for mk in models.keys():
        models[mk].fit(X_pool, y_pool)
        _, auc = evaluate_model(models[mk], X_test, y_test)
        aucs[mk] = auc

    fe = feature_expert(X_pool, y_pool, metric="L1", C=args.c)

    all_feature_model = FeatureMNBUniform(fe.feature_rank[0],
                                          fe.feature_rank[1],
                                          fe.num_features,
                                          smoothing=args.smoothing)
    all_feature_model.update()

    _, all_auc = evaluate_model(all_feature_model, X_test, y_test)

    k_feature_model = FeatureMNBUniform(fe.feature_rank[0][:args.k],
                                        fe.feature_rank[1][:args.k],
                                        fe.num_features,
                                        smoothing=args.smoothing)
    k_feature_model.update()

    _, k_auc = evaluate_model(k_feature_model, X_test, y_test)
Example No. 16
def covering(dataset='imdb', first='positive', agreement='any', metric='mutual_info', smoothing=1e-6, C=1):
    if first == 'positive':
        offset = 1
    else:
        offset = 0
    class_label = {0:'negative', 1:'positive'}
    vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
    
    if dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset, tuple) and len(dataset) == 3 and dataset[0] == 'newsgroups':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], shuffle=False, random_state=42, \
            vectorizer=vect)
    
    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
        
    print('class 0 features (ranked):')
    print(', '.join([str((f, feature_names[f])) for f in fe.class0_features_by_rank()]))

    print('class 1 features (ranked):')
    print(', '.join([str((f, feature_names[f])) for f in fe.class1_features_by_rank()]))
    
    sample_pool = list(range(X_pool.shape[0]))
    feature_list = list()
    X_csc = X_pool.tocsc()
    
    feature_num = 0

    while len(sample_pool) != 0:
        label = (feature_num + offset) % 2  # label for the document
        rank = feature_num // 2  # rank of the feature in the list
        feature_num += 1
        
        if rank < len(fe.feature_rank[label]):
            feature = fe.feature_rank[label][rank]
        else:
            print('*' * 50)
            print(', '.join(['#' + str(doc) for doc in sample_pool]) + ' are uncovered')
            for doc in sample_pool:
                print('-' * 50)
                print('Document #%d:' % doc)
                print('=' * 50)
                print('length = %d' % len(X_pool_docs[doc]))
                print(X_pool_docs[doc])
                print('=' * 50)
                print(X_pool[doc].indices)
            break
            
        feature_name = feature_names[feature]
        docs_with_feature = X_csc.getcol(feature).indices

        docs_in_pool_with_feature = list(set(sample_pool).intersection(set(docs_with_feature)))
        if len(docs_in_pool_with_feature) == 0:
            continue
        else:
            num_docs_covered = len(docs_in_pool_with_feature)
            num_positive_docs = len(np.nonzero(y_pool[docs_in_pool_with_feature] == 1)[0])
            num_negative_docs = len(np.nonzero(y_pool[docs_in_pool_with_feature] == 0)[0])

            poolsize_before_removal = len(sample_pool)
            
            if agreement == 'agree':
                docs_with_label = np.nonzero(y_pool == label)[0]
                docs_to_remove = list(set(docs_in_pool_with_feature).intersection(set(docs_with_label)))
                sample_pool = list(set(sample_pool).difference(set(docs_to_remove)))
            else:
                sample_pool = list(set(sample_pool).difference(set(docs_in_pool_with_feature)))

            # pack the information into a dictionary for easy printing   
            result = dict()
            result['name'] = feature_name
            result['num'] = feature
            result['class'] = class_label[label]
            result['poolsize_before_removal'] = poolsize_before_removal
            result['num_docs_covered'] = num_docs_covered
            result['num_positive_docs'] = num_positive_docs
            result['num_negative_docs'] = num_negative_docs
            result['poolsize_after_removal'] = len(sample_pool)
            
            feature_list.append(result)

    return feature_list
Example No. 17
    args = parser.parse_args()

    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(args.dataset)

    models = {'MultinomialNB(alpha=1)': MultinomialNB(alpha=1), \
              'LogisticRegression(C=1, penalty=\'l1\')': LogisticRegression(C=1, penalty='l1'), \
              'LogisticRegression(C=0.1, penalty=\'l1\')': LogisticRegression(C=0.1, penalty='l1')}

    aucs = {}

    for mk in models.keys():
        models[mk].fit(X_pool, y_pool)
        _, auc = evaluate_model(models[mk], X_test, y_test)
        aucs[mk] = auc

    fe = feature_expert(X_pool, y_pool, metric="L1", C=args.c)

    all_feature_model = FeatureMNBUniform(fe.feature_rank[0], fe.feature_rank[1], fe.num_features, smoothing=args.smoothing)
    all_feature_model.update()

    _, all_auc = evaluate_model(all_feature_model, X_test, y_test)

    k_feature_model = FeatureMNBUniform(fe.feature_rank[0][:args.k], fe.feature_rank[1][:args.k], fe.num_features, smoothing=args.smoothing)
    k_feature_model.update()

    _, k_auc = evaluate_model(k_feature_model, X_test, y_test)
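
A plausible continuation, not present in the original snippet, comparing the collected AUCs:

    # Compare the supervised baselines against the feature-only models.
    for mk, auc in aucs.items():
        print('%s: auc = %0.4f' % (mk, auc))
    print('FeatureMNBUniform (all features): auc = %0.4f' % all_auc)
    print('FeatureMNBUniform (top %d per class): auc = %0.4f' % (args.k, k_auc))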