示例#1
0
def run_atl(train_ds, test_ds, lbr, model, qs, quota, n_init_labeled):
    """Run an active-learning loop and record train/test scores per query.

    For each of the ``quota`` iterations the query strategy ``qs`` selects
    an entry of ``train_ds``, the labeler ``lbr`` labels it, ``train_ds`` is
    updated with that label and ``model`` is retrained on ``train_ds``.
    After every retraining ``model.score`` and the binary F1 score
    (positive label 1) are recorded for the data returned by
    ``train_ds.format_sklearn()`` and for the test set.

    @parameters
    train_ds: Dataset that is queried and updated; must offer ``data``,
        ``update(ask_id, label)`` and ``format_sklearn()``.
    test_ds: Dataset used for the out-of-sample scores; it is only read.
    lbr: Labeler whose ``label(features)`` returns the label of a queried entry.
    model: Model offering ``train``, ``score`` and ``predict``.
    qs: Query strategy whose ``make_query()`` returns the id of the next entry.
    quota: Number of queries (loop iterations) to perform.
    n_init_labeled: Not used by this function; kept so existing callers
        keep working.

    @returns
    Tuple ``(E_in, E_in_f1, E_out, E_out_f1, model, runt)`` where the four
    ``E_*`` sequences hold one value per iteration (score and F1 on the
    training data, score and F1 on the test data) and ``runt`` is the
    elapsed wall-clock time in seconds.
    """
    start_time = time.time()

    E_in, E_in_f1, E_out, E_out_f1 = [], [], [], []

    # test_ds is never modified inside the loop, so it is formatted once
    # here instead of once per query (same as run_weighted_atl does).
    X_test, y_test = test_ds.format_sklearn()

    sup.printProgressBar(0,
                         quota,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)

    for i in range(quota):
        # Ask the query strategy which entry to label next ...
        ask_id = qs.make_query()
        # ... obtain its label from the labeler ...
        lb = lbr.label(train_ds.data[ask_id][0])
        # ... and store the label in the training dataset.
        train_ds.update(ask_id, lb)

        model.train(train_ds)

        # Scores on the current training data.
        X_train_current, y_train_current = train_ds.format_sklearn()
        E_in = np.append(E_in, model.score(train_ds))
        E_in_f1 = np.append(
            E_in_f1,
            f1_score(y_train_current,
                     model.predict(X_train_current),
                     pos_label=1,
                     average='binary',
                     sample_weight=None))

        # Scores on the test data; only the F1 value of the
        # precision/recall/F1/support tuple is tracked here.
        E_out = np.append(E_out, model.score(test_ds))
        _, _, f1score, _ = precision_recall_fscore_support(
            y_test, model.predict(X_test), average='binary')
        E_out_f1 = np.append(E_out_f1, f1score)

        # Update Progress Bar
        sup.printProgressBar(i + 1,
                             quota,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    runt = time.time() - start_time
    print('Runtime: {:.2f} seconds'.format(runt))

    return E_in, E_in_f1, E_out, E_out_f1, model, runt
def calcDomainRelatednessCVinDict(candsets,
                                  all_features,
                                  dense_features_dict=None,
                                  cv=5,
                                  metric='phi'):
    """Compute the domain relatedness for all related pairs of candidate sets.

    A pair of candidate-set names is considered when the first or the second
    '_'-separated token of the first name also occurs among the tokens of
    the second name. For each such pair ``calcDomainRelatednessCV`` is
    called with ``all_features`` and, when ``dense_features_dict`` is given,
    a second time with the dense features of that pair.

    @parameters
    candsets: Dictionary mapping candidate-set names to candidate sets.
    all_features: Features passed to ``calcDomainRelatednessCV`` for the
        'all' result.
    dense_features_dict: Optional dictionary with the dense features per
        pair. The key is built from the unique tokens of both names, sorted
        and joined with '_'. Default: None (no 'dense' result).
    cv: Forwarded unchanged to ``calcDomainRelatednessCV``. Default: 5
    metric: Forwarded unchanged to ``calcDomainRelatednessCV``. Default: 'phi'

    @returns
    Dictionary ``{(name_a, name_b): {'all': ..., 'dense': ...}}``; the
    'dense' entry only exists when ``dense_features_dict`` was given.
    """
    # Select the pairs that share a name token. The second token of the
    # first name is only inspected when the first token did not match.
    related_pairs = []
    for pair in itertools.combinations(candsets, 2):
        first_tokens = pair[0].split('_')
        second_tokens = pair[1].split('_')
        if (first_tokens[0] in second_tokens
                or first_tokens[1] in second_tokens):
            related_pairs.append(pair)

    total = len(related_pairs)
    sup.printProgressBar(0,
                         total,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)

    relatedness = {}
    for done, pair in enumerate(related_pairs, start=1):
        name_a, name_b = pair
        entry = {
            'all':
            calcDomainRelatednessCV(candsets[name_a], candsets[name_b],
                                    all_features, cv, metric)
        }

        if dense_features_dict is not None:
            # Key of the dense features: unique name tokens in sorted order.
            tokens = set(name_a.split('_') + name_b.split('_'))
            dense_feature_key = '_'.join(sorted(tokens))
            entry['dense'] = calcDomainRelatednessCV(
                candsets[name_a], candsets[name_b],
                dense_features_dict[dense_feature_key], cv, metric)

        relatedness[pair] = entry

        # Update Progress Bar
        sup.printProgressBar(done,
                             total,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    return relatedness
示例#3
0
def run_weighted_atl(train_ds, test_ds, lbr, model, qs, quota):
    """Run active transfer learning with (optionally weighted) source data.

    For each of the ``quota`` iterations one entry of ``train_ds`` is
    queried via ``qs``, labeled by ``lbr`` and stored in ``train_ds``. If
    the obtained label differs from the stored transfer label, the transfer
    label is corrected and the correction is counted. The model is then
    (re)trained in one of three ways (see the comments in the loop) and
    scored on its training data and on the test set.

    @parameters
    train_ds: Training dataset; must offer ``data``, ``update``,
        ``update_transfer_labels``, ``format_sklearn(no_weights=True)``,
        ``get_source_training_data()`` and the attributes ``_y`` and
        ``_y_transfer_labels``.
    test_ds: Test dataset; it is only read.
    lbr: Labeler (oracle) whose ``label(features)`` returns the true label.
    model: Wrapper around an estimator (``model.model``) offering ``train``,
        ``score``, ``predict``, ``predict_proba``, ``feature_importances_()``
        and ``get_trees_max_depth()``.
    qs: Query strategy whose ``make_query()`` returns the id of the next entry.
    quota: Number of queries (loop iterations) to perform.

    @returns
    Tuple ``(E_in, E_in_f1, E_out, E_out_f1, E_out_P, E_out_R, model, runt,
    share_of_corrected_labeles, model_pred_prob, model_feature_import,
    model_depth_tree)``. The ``E_*`` sequences hold one value per iteration
    (train score, train F1, test score, test F1, test precision, test
    recall). ``runt`` is the runtime in seconds and
    ``share_of_corrected_labeles`` the fraction of queries whose transfer
    label had to be corrected (0.0 when ``quota`` is 0). The three
    ``model_*`` lists hold model snapshots: one taken after the source-only
    fit in iteration 0 (warm-start case only) and one taken after the last
    iteration.
    """
    start_time = time.time()

    E_in, E_in_f1, E_out, E_out_f1 = [], [], [], []
    E_out_P, E_out_R = [], []

    model_pred_prob, model_feature_import, model_depth_tree = [], [], []

    corrected_labels = []

    X_test, y_test = test_ds.format_sklearn()

    def record_model_snapshot():
        # Store prediction probabilities on the test set, the feature
        # importances and the maximum tree depths of the current model.
        model_pred_prob.append(model.predict_proba(X_test))
        model_feature_import.append(model.feature_importances_())
        model_depth_tree.append(model.get_trees_max_depth())

    sup.printProgressBar(0,
                         quota,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)

    for i in range(quota):
        # Query strategy picks the entry, the oracle labels it and the
        # label is stored in the training dataset.
        ask_id = qs.make_query()
        lb = lbr.label(train_ds.data[ask_id][0])
        train_ds.update(ask_id, lb)

        # Count (and repair) transfer labels that disagree with the oracle.
        if train_ds._y_transfer_labels[ask_id] != lb:
            corrected_labels.append(1)
            train_ds.update_transfer_labels(ask_id, lb)
        else:
            corrected_labels.append(0)

        warm_start = model.model.warm_start
        source_only_fit = False

        if warm_start and 1 in train_ds._y and 0 in train_ds._y and i > 0:
            # Warm start after the first iteration: two more estimators are
            # added and trained on train_ds without weights, i.e. on the
            # target instances queried so far (per the original author's
            # note, the first such fit sees one positive and one negative
            # target instance).
            model.model.n_estimators += 2
            model.train(train_ds, no_weights=True)
            X_train_current, y_train_current = train_ds.format_sklearn(
                no_weights=True)
        elif warm_start and i == 0:
            # Warm start, first iteration: fit on the source instances only.
            # Whether importance weights are applied depends on how the
            # source dataset was set up (per the original author's note).
            model.train(train_ds.get_source_training_data())
            X_train_current, y_train_current = train_ds.format_sklearn(
                no_weights=True)
            source_only_fit = True
        else:
            # No warm start (or warm start requested while train_ds._y does
            # not yet contain both labels 0 and 1 after iteration 0): train
            # from the source instances plus the current target instances.
            X_source, y_source, sample_weights = train_ds.get_source_training_data(
            ).format_sklearn()
            X_target_current, y_target_current = train_ds.format_sklearn(
                no_weights=True)
            X_train_current = np.vstack([X_source, X_target_current])
            y_train_current = np.append(y_source, y_target_current)
            # assign a weight of 1 to each target instance
            sample_weights = np.concatenate(
                [sample_weights, [1] * (y_target_current.shape[0])])
            model.train(
                le.AWTLDataset(X_train_current, y_train_current,
                               sample_weights))

        # Training score; identical for all three training variants, only
        # X_train_current / y_train_current differ.
        E_in = np.append(
            E_in, model.score(Dataset(X=X_train_current, y=y_train_current)))
        E_in_f1 = np.append(
            E_in_f1,
            f1_score(y_train_current,
                     model.predict(X_train_current),
                     pos_label=1,
                     average='binary'))

        if source_only_fit:
            # Snapshot of the model trained on the source instances only.
            record_model_snapshot()

        # Test score of this iteration.
        E_out = np.append(E_out, model.score(test_ds))
        prec, recall, f1score, _ = precision_recall_fscore_support(
            y_test, model.predict(X_test), average='binary')

        # Append BEFORE reporting so that the [-1] entries printed below
        # belong to the current (last) iteration and exist for quota == 1.
        E_out_f1 = np.append(E_out_f1, f1score)
        E_out_P = np.append(E_out_P, prec)
        E_out_R = np.append(E_out_R, recall)

        if i == quota - 1:
            record_model_snapshot()
            print(
                'Last iteration Performance on Target Test Set: F1 {:.2f}; Prec {:.2f}; Recall {:.2f}'
                .format(E_out_f1[-1], E_out_P[-1], E_out_R[-1]))
            if len(model_depth_tree) > 1:
                print(
                    'Average depth of trees at start (iteration 0): {} at last iteration {}'
                    .format(np.mean(model_depth_tree[0]),
                            np.mean(model_depth_tree[-1])))
            else:
                # Without the source-only fit of iteration 0 there is no
                # start snapshot to compare with.
                print('Average depth of trees at last iteration {}'.format(
                    np.mean(model_depth_tree[-1])))

        # Update Progress Bar
        sup.printProgressBar(i + 1,
                             quota,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    # Guard against quota == 0, where no label could have been corrected.
    share_of_corrected_labeles = (sum(corrected_labels) /
                                  quota if quota else 0.0)
    runt = time.time() - start_time
    print('Runtime: {:.2f} seconds'.format(runt))
    print('Corrected labels from transfer: {}'.format(sum(corrected_labels)))

    return E_in, E_in_f1, E_out, E_out_f1, E_out_P, E_out_R, model, runt, share_of_corrected_labeles, model_pred_prob, model_feature_import, model_depth_tree
示例#4
0
def performTLFromDict(candsets,candsets_train,candsets_test,estimators,all_features,dense_features_dict=None,da_weighting=None,n=10):
    """
    ***IMPORTANT*** -> Very time consuming. Hence, results should be saved to hard disk with the saveTLResultsToJSON() function,
    so that the experiments do not necessarily need to be repeated.

    Perform the transfer learning experiment for each related source-target pair in the candsets dictionary, in both
    directions (a -> b and b -> a), for every estimator in estimators and for a fixed list of target training sizes x.
    Each single measurement is delegated to getF1SourceTargetFixedAvg(), whose result is read as
    (transfer result, target result). The experiment is run on all_features and, if dense_features_dict is given,
    additionally on the dense features of the pair.

    A pair of names is related when the first or the second '_'-separated token of the first name occurs among the
    tokens of the second name.

    @parameters
    candsets: Dictionary containing all candidate sets (pot. correspondences)
    candsets_train: Dictionary containing all training sets (pot. correspondences)
    candsets_test: Dictionary containing all test sets (pot. correspondences)
    estimators: Dictionary with the sklearn estimators that shall be used for the TL experiment. The dictionary should be
    of the form {'logreg':LogisticRegression(),'logregcv':LogisticRegressionCV(),...}
    all_features: List with all features
    dense_features_dict: Dictionary with the list of only the dense features for each combination. Example: when the source
    is ban_half and the target is wor_half, the dense features across ban, half and wor need to be stored in a list under
    dense_features_dict['ban_half_wor']. The key must be composed of the unique name tokens in alphabetical order,
    separated by '_'. Default: None (dense features are skipped)
    da_weighting: Forwarded to getF1SourceTargetFixedAvg(); also used as result key. None is stored under 'no_weighting'.
    n: Forwarded to getF1SourceTargetFixedAvg() and stored as 'n_runs'. Per the original author: the number of random
    samples the experiments are performed on and averaged over. 100 will explode computing time!!! Default: 10

    @returns
    Dictionary of the form d[(source, target)][feature_set][weighting][estimator_name] where feature_set is 'all' or
    'dense' and the innermost value is a dictionary with the keys 'transfer_avg_result', 'target_max_result',
    'x_target_exceed', 'y_transfer_results', 'y_target_results' and 'n_runs'.
    """
    x_instances = [10,14,20,24,28,32,38,44,50,60,70,80,90,100,120,140,160,180,200,300,500]
    weight_key = 'no_weighting' if da_weighting is None else da_weighting

    d = {}

    combinations = []
    for combo in itertools.combinations(candsets, 2):
        first_tokens = combo[0].split('_')
        second_tokens = combo[1].split('_')
        # second token of the first name is only checked if the first token did not match
        if first_tokens[0] in second_tokens or first_tokens[1] in second_tokens:
            combinations.append(combo)

    l = len(combinations)
    sup.printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50)

    for i, combo in enumerate(combinations):
        # both transfer directions: a -> b is stored under combo, b -> a under the reversed tuple
        directions = (combo, combo[::-1])
        if dense_features_dict is not None:
            # the key only depends on the pair, so it is built once per combination
            dense_feature_key = '_'.join(sorted(set(combo[0].split('_') + combo[1].split('_'))))

        for clf in estimators:
            feature_sets = [('all', all_features)]
            if dense_features_dict is not None:
                feature_sets.append(('dense', dense_features_dict[dense_feature_key]))

            # curves[feature_set][direction] = (transfer results, target results), one value per x
            curves = {name: {direction: ([], []) for direction in directions} for name, _ in feature_sets}

            # The call order per x (all a->b, all b->a, dense a->b, dense b->a) is kept as it was,
            # in case getF1SourceTargetFixedAvg() depends on shared random state.
            for x in x_instances:
                for name, features in feature_sets:
                    for direction in directions:
                        source_name, target_name = direction
                        res = getF1SourceTargetFixedAvg(candsets[source_name],
                                                        candsets[target_name],
                                                        candsets_train[target_name],
                                                        candsets_test[target_name],
                                                        estimators[clf],
                                                        features,da_weighting,x,n)
                        transfer_results, target_results = curves[name][direction]
                        transfer_results.append(res[0])
                        target_results.append(res[1])

            for name, _ in feature_sets:
                for direction in directions:
                    transfer_results, target_results = curves[name][direction]
                    summary = _summarize_tl_results(transfer_results, target_results, x_instances, n)
                    d.setdefault(direction, {}).setdefault(name, {}).setdefault(weight_key, {})[clf] = summary
        # Update Progress Bar
        sup.printProgressBar(i + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
    return d


def _summarize_tl_results(transfer_results, target_results, x_instances, n):
    """
    Condense the per-x results of one source-target direction into the result dictionary stored by performTLFromDict().

    'x_target_exceed' is the x value at the first position where the sign of (transfer result - target result)
    changes between two consecutive x values, or NaN if the sign never changes.
    """
    sign_changes = np.diff(np.sign(np.array(transfer_results) - np.array(target_results)))
    crossings = np.flatnonzero(sign_changes)
    x_target_exceed = x_instances[crossings[0]] if crossings.size else np.nan
    return {'transfer_avg_result':sum(transfer_results)/len(x_instances),
            'target_max_result':max(target_results),
            'x_target_exceed':x_target_exceed,
            'y_transfer_results':transfer_results,
            'y_target_results':target_results,
            'n_runs':n}
示例#5
0
def _tl_first_crossing_x(x_instances, transfer_results, target_results):
    """Return the x value at the first sign change of (transfer - target), or NaN if there is none."""
    gap = np.array(transfer_results) - np.array(target_results)
    # Non-zero entries of the diff of signs mark positions where the two curves cross.
    crossings = np.argwhere(np.diff(np.sign(gap))).flatten()
    if crossings.size == 0:
        return np.nan
    return x_instances[crossings[0]]


def _tl_summarise_run(x_instances, transfer_results, target_results, n):
    """Build the per-classifier result entry from the raw per-x result lists."""
    return {'transfer_avg_result': sum(transfer_results) / len(x_instances),
            'target_max_result': max(target_results),
            'x_target_exceed': _tl_first_crossing_x(x_instances, transfer_results, target_results),
            'y_transfer_results': transfer_results,
            'y_target_results': target_results,
            'n_runs': n}


def performSingleTLExp(source,target,source_target_name,target_train,target_test,estimators,features,da_weighting=None,n=10,switch_roles=True):
    """
    Backup function to perform single experiment.

    For every estimator and every entry of a fixed list of instance counts,
    getF1SourceTargetFixedAvg is called and its first two return values are
    collected as the transfer result and the target result respectively.

    source, target: passed through to getF1SourceTargetFixedAvg as its first two arguments
    source_target_name: result key for the source->target direction. When switch_roles
        is True it must contain an underscore; the key of the reversed direction is built
        by swapping the first two '_'-separated parts.
    target_train, target_test: passed through to getF1SourceTargetFixedAvg
    estimators: dictionary mapping a classifier name to the estimator handed to
        getF1SourceTargetFixedAvg
    features: passed through to getF1SourceTargetFixedAvg
    da_weighting: passed through to getF1SourceTargetFixedAvg; also used as second-level
        result key ('no_weighting' when None)
    n: passed through to getF1SourceTargetFixedAvg and stored as 'n_runs'
    switch_roles: if True the experiment is additionally run with source and target swapped

    Returns:
        dictionary {name: {weighting_key: {clf: entry}}} where entry holds
        'transfer_avg_result' (sum of transfer results divided by the number of x values),
        'target_max_result' (maximum target result),
        'x_target_exceed' (x at the first sign change of transfer minus target, NaN if none),
        'y_transfer_results', 'y_target_results' (the raw lists) and 'n_runs'.
    """
    x_instances = [10,14,20,24,28,32,38,44,50,60,70,80,90,100,120,140,160,180,200,300,500]
    weighting_key = 'no_weighting' if da_weighting is None else da_weighting

    d = {}

    l = len(estimators.keys())
    sup.printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50)

    # Each direction is (result key, first argument, second argument) for getF1SourceTargetFixedAvg.
    directions = [(source_target_name, source, target)]
    if(switch_roles):
        name_parts = source_target_name.split('_')
        target_source_name = '{}_{}'.format(name_parts[1], name_parts[0])
        # NOTE(review): the reversed direction still receives target_train/target_test
        # of the original target - confirm that this is intended.
        directions.append((target_source_name, target, source))

    for i, clf in enumerate(estimators):
        # one (transfer_results, target_results) pair of lists per direction
        collected = [([], []) for _ in directions]

        for x in x_instances:
            # directions are evaluated in a fixed order for every x (a->b first, then b->a)
            for (_, first, second), (transfer_results, target_results) in zip(directions, collected):
                res = getF1SourceTargetFixedAvg(first,second,target_train,target_test,estimators[clf],features,da_weighting,x,n)
                transfer_results.append(res[0])
                target_results.append(res[1])

        for (name, _, _), (transfer_results, target_results) in zip(directions, collected):
            entry = _tl_summarise_run(x_instances, transfer_results, target_results, n)
            d.setdefault(name, {}).setdefault(weighting_key, {})[clf] = entry

        # Update Progress Bar
        sup.printProgressBar(i + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
    return d
示例#6
0
def run_al(train_ds, test_ds, lbr, model, qs, quota, n_init_labeled):
    """
    Run an active-learning loop and record train/test metrics after every query.

    The model is trained once on train_ds before any query. Afterwards
    (quota - n_init_labeled) iterations are executed; in each one an instance is
    requested via qs.make_query(), labelled via lbr.label(), written back with
    train_ds.update() and the model is retrained.

    Returns:
        E_in, E_in_f1, E_out, E_out_f1, E_out_P, E_out_R, model, runt,
        model_pred_prob, model_feature_import, model_depth_tree

        The three model_* lists hold a snapshot taken after the initial training
        and one taken after the last query iteration.
    """
    t_start = time.time()

    E_in, E_in_f1, E_out, E_out_f1, E_out_P, E_out_R = [], [], [], [], [], []
    model_pred_prob, model_feature_import, model_depth_tree = [], [], []
    # collected during the loop; not part of the return value
    labels = []

    def snapshot_model():
        # Record predicted probabilities on the test set, feature importances and,
        # for models named 'dt' / 'rf', the tree depth information.
        model_pred_prob.append(model.predict_proba(X_test))
        model_feature_import.append(model.feature_importances_())
        if model.name == 'dt':
            model_depth_tree.append(model.get_tree_max_depth())
        if model.name == 'rf':
            model_depth_tree.append(model.get_trees_max_depth())

    # Pad the metric series with one 0.0 per initially labelled instance.
    # NOTE(review): E_out is not padded here, unlike the other five series - confirm intended.
    for _ in range(n_init_labeled):
        E_out_f1 = np.append(E_out_f1, 0.0)
        E_out_P = np.append(E_out_P, 0.0)
        E_out_R = np.append(E_out_R, 0.0)
        E_in_f1 = np.append(E_in_f1, 0.0)
        E_in = np.append(E_in, 0.0)

    X_test, y_test = test_ds.format_sklearn()

    model.train(train_ds)
    snapshot_model()

    n_queries = quota - n_init_labeled
    sup.printProgressBar(0, n_queries, prefix='Progress:', suffix='Complete', length=50)

    for step in range(n_queries):
        # ask the query strategy, obtain the label and store it in the training set
        ask_id = qs.make_query()
        lb = lbr.label(train_ds.data[ask_id][0])
        train_ds.update(ask_id, lb)
        labels.append(lb)

        model.train(train_ds)

        # metrics on the current training set
        X_train_current, y_train_current = train_ds.format_sklearn()
        E_in = np.append(E_in, model.score(train_ds))
        train_f1 = f1_score(y_train_current,
                            model.predict(X_train_current),
                            pos_label=1,
                            average='binary',
                            sample_weight=None)
        E_in_f1 = np.append(E_in_f1, train_f1)

        # metrics on the test set
        E_out = np.append(E_out, model.score(test_ds))
        prec, recall, f1score, support = precision_recall_fscore_support(
            y_test, model.predict(X_test), average='binary')

        # take the second snapshot after the final query only
        if step == n_queries - 1:
            snapshot_model()

        E_out_f1 = np.append(E_out_f1, f1score)
        E_out_P = np.append(E_out_P, prec)
        E_out_R = np.append(E_out_R, recall)

        # Update Progress Bar
        sup.printProgressBar(step + 1, n_queries, prefix='Progress:', suffix='Complete', length=50)

    runt = time.time() - t_start
    print('Runtime: {:.2f} seconds'.format(runt))

    return (E_in, E_in_f1, E_out, E_out_f1, E_out_P, E_out_R, model, runt,
            model_pred_prob, model_feature_import, model_depth_tree)
def returnSuperBMsInDict(candsets_train,
                         candsets_test,
                         estimators,
                         features,
                         progress_bar=True):
    """
    For each candset in the candsets dictionary calculate the performance on the hold-out test set when trained on
    its training set, using the estimators provided in the estimators dictionary on the features specified in the
    features argument.

    Candsets_train: dictionary of all training sets
    Candsets_test: dictionary of all test sets (looked up with the same keys as candsets_train)
    Estimators: dictionary of sklearn estimators that shall be used to train a classifier (Exp: {'logreg':LogisticRegression(),'dectree':DecisionTree()})
    features: list of features that shall be used
    Progress_bar: Boolean if progress bar shall be printed to track progress. Default: True

    Returns:
        dictionary with the string-formatted candset keys as first keys and estimator names as second keys.
        f1 and model_params are the final keys
    """
    d = {}

    if (progress_bar):
        l = len(candsets_train.keys())
        sup.printProgressBar(0,
                             l,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    for i, candset in enumerate(candsets_train):
        # Results are stored under the string form of the key; look up with that
        # same string so that non-string keys accumulate instead of being overwritten.
        key = '{}'.format(candset)
        for clf in estimators:
            # The helper returns (f1, model_params); unpack it in both modes so that
            # progress_bar=False no longer fails on an undefined `params`.
            res, params = returnPassiveLearningResultsHoldoutSet(
                estimators[clf], candsets_train[candset],
                candsets_test[candset], features)
            d.setdefault(key, {})[clf] = {'f1': res, 'model_params': params}
        if (progress_bar):
            # Update Progress Bar
            sup.printProgressBar(i + 1,
                                 l,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)

    return d