Example #1
def i_EM(P,
         U,
         max_imbalance=10.0,
         max_pos_ratio=1.0,
         tolerance=0.1,
         verbose=False):
    """all-in-one PU method: I-EM algorithm for positive set P and unlabelled set U

    iterate NB classifier with updated labels for unlabelled set (initially negative) until convergence
    if U is much larger than P, randomly samples max_imbalance*|P| docs from U"""

    print("Running I-EM")

    # if num_rows(U) > max_imbalance * num_rows(P):
    #     U = np.array(random.sample(list(U), int(max_imbalance * num_rows(P))))

    model = iterate_EM(P,
                       U,
                       tolerance=tolerance,
                       max_pos_ratio=max_pos_ratio,
                       clf_selection=False,
                       verbose=verbose)

    if verbose:
        train_report(model, P, U)

    return model
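
A minimal usage sketch, assuming P and U are feature matrices built with a shared vectorizer; the toy corpus, the TfidfVectorizer choice, and the final predict call on the returned model are illustrative assumptions rather than part of the original example:

from sklearn.feature_extraction.text import TfidfVectorizer

pos_texts = ["protein binding assay", "gene expression study"]     # known positives
unl_texts = ["weather report", "protein folding", "stock prices"]  # unlabelled mix

vec = TfidfVectorizer()
X = vec.fit_transform(pos_texts + unl_texts)   # one shared vocabulary for P and U
P, U = X[:len(pos_texts)], X[len(pos_texts):]

em_model = i_EM(P, U, tolerance=0.05, verbose=True)
print(em_model.predict(U))   # assumes the returned EM/NB model is scikit-learn-like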
Example #2
def biased_SVM_grid_search(P,
                           U,
                           Cs=None,
                           kernel='linear',
                           n_estimators=9,
                           verbose=False):
    if Cs is None:
        Cs = [10**x for x in range(-12, 12, 2)]

    if verbose:
        print(
            "Running Biased-SVM with balanced class weights and grid search over",
            len(Cs), "C values")

    model = BaggingClassifier(LinearSVC())

    grid_search = GridSearchCV(
        model,
        param_grid={
            'base_estimator__C': Cs,
            'base_estimator__class_weight': ['balanced'],
            ### not applicable for LinearSVC
            # 'base_estimator__kernel'      : [kernel],
            # 'base_estimator__cache_size'  : [8000],
            # 'base_estimator__probability' : [True],
            ### fit parameters for Bagging wrapper
            'bootstrap': [True],
            'n_estimators': [n_estimators],
            ### parallelization incompatible with multiprocessing
            # 'n_jobs'                      : [n_estimators]
        },
        scoring=pu_scorer,
        verbose=0)

    if verbose:
        print("Grid searching parameters for biased-SVM")
    X = concatenate((P, U))
    y = concatenate((ones(num_rows(P)), zeros(num_rows(U))))

    grid_search.fit(X, y)

    if verbose:
        train_report(grid_search.best_estimator_, P, U)
    print("Biased-SVM parameters:", grid_search.best_params_, "\tPU score:",
          grid_search.best_score_)

    return grid_search.best_estimator_
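
A hedged call sketch for the grid search: the default C grid spans 10^-12 to 10^10, which can be slow, so a narrower grid and fewer bagging estimators are passed here (P and U are assumed to be pre-vectorized feature matrices as in the sketch above):

best_svm = biased_SVM_grid_search(P, U,
                                  Cs=[0.01, 0.1, 1, 10],
                                  n_estimators=5,
                                  verbose=True)
scores = best_svm.decision_function(U)   # rank unlabelled docs; a bagged LinearSVC exposes decision_function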
Example #3
def standalone_rocchio(P, U, alpha=16, beta=4, verbose=False):
    """1-step Rocchio method"""

    print("Running Rocchio")

    if verbose:
        print("Building Rocchio model to determine Reliable Negative examples")
    model = rocchio(P, U, alpha=alpha, beta=beta)

    if verbose:
        y_U = model.predict(U)
        U_minus_RN, RN = partition_pos_neg(U, y_U)
        print("Reliable Negative examples in U:", num_rows(RN), "(",
              100 * num_rows(RN) / num_rows(U), "%)")
        train_report(model, P, U)

    return model
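
The fitted Rocchio model can also be applied to new documents; a small sketch reusing the vectorizer from the first sketch (new_docs is invented here, and it is assumed that the Rocchio helper labels positive-looking documents as 1):

roc = standalone_rocchio(P, U, alpha=16, beta=4, verbose=True)

new_docs = vec.transform(["ion channel activity", "football results"])
print(roc.predict(new_docs))   # assumed convention: 1 = candidate positive, 0 = reliable negative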
Example #4
def cr_SVM(P,
           U,
           max_neg_ratio=0.1,
           noise_lvl=0.2,
           alpha=16,
           beta=4,
           kernel=None,
           C=0.1,
           verbose=False):
    """Two-Step technique based on Cosine Similarity, Rocchio and SVM

    Step 1.1: Find Potentially Negative docs (less similar to mean(P) than noise_lvl of docs in P)
    Step 1.2: Find Reliable Negative docs using Rocchio (similarity to mean positive/PN vector)
    Step 2: Iterate SVM starting from P and RN sets until classification of U converges

    The noise level is quite crucial; noise_lvl should be >= 0.2 (20%) to give reasonable results."""

    print("Running CR-SVM")

    # step 1
    if verbose:
        print("Determining RN using Cosine Similarity threshold and Rocchio\n")
    U_minus_RN, RN = get_RN_cosine_rocchio(P,
                                           U,
                                           noise_lvl=noise_lvl,
                                           alpha=alpha,
                                           beta=beta,
                                           verbose=verbose)

    # step2
    if verbose:
        print("\nIterating SVM with P, U-RN, and RN")
    model = iterate_SVM(P,
                        U_minus_RN,
                        RN,
                        kernel=kernel,
                        C=C,
                        max_neg_ratio=max_neg_ratio,
                        verbose=verbose)

    if verbose:
        train_report(model, P, U)

    return model
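
Because the docstring stresses that noise_lvl should be at least 0.2, a sketch that tries two noise levels and reports recall on the known positives as a rough sanity check (the recall heuristic and the scikit-learn-style predict call are illustrative assumptions, not part of the original code):

for lvl in (0.2, 0.3):
    m = cr_SVM(P, U, noise_lvl=lvl, C=0.1, verbose=False)
    print("noise_lvl", lvl, "recall on P:", (m.predict(P) == 1).mean())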
Example #5
def s_EM(P,
         U,
         spy_ratio=0.1,
         max_pos_ratio=1.0,
         tolerance=0.1,
         noise_lvl=0.1,
         clf_selection=True,
         verbose=False):
    """S-EM two-step PU learning as described in \"Partially Supervised Classification...\".

    1st step: get Reliable Negative documents using Spy Documents
    2nd step: iterate EM with P, U-RN, and RN
    """

    print("Running S-EM")

    # P, U = arrays([P, U])

    # step 1
    if verbose:
        print(
            "Determining confidence threshold using Spy Documents and I-EM\n")
    U_minus_RN, RN = get_RN_Spy_Docs(P,
                                     U,
                                     spy_ratio=spy_ratio,
                                     tolerance=tolerance,
                                     noise_lvl=noise_lvl,
                                     verbose=verbose)

    # step2
    if verbose:
        print("\nIterating I-EM with P, U-RN, and RN")
    model = run_EM_with_RN(P,
                           U_minus_RN,
                           RN,
                           tolerance=tolerance,
                           max_pos_ratio=max_pos_ratio,
                           clf_selection=clf_selection,
                           verbose=verbose)

    if verbose:
        train_report(model, P, U)

    return model
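
A call sketch varying the spy parameters; a larger spy_ratio holds out more of P as spies in step 1 (the values and the interpretation of noise_lvl are illustrative assumptions):

sem_model = s_EM(P, U,
                 spy_ratio=0.15,   # hold out 15% of P as spy documents
                 noise_lvl=0.05,   # lower noise level, assumed to make RN selection more conservative
                 verbose=True)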
Example #6
def roc_EM(P,
           U,
           max_pos_ratio=0.5,
           tolerance=0.1,
           clf_selection=True,
           alpha=16,
           beta=4,
           verbose=False):
    """S-EM two-step PU learning as described in \"Partially Supervised Classification...\".

    1st step: get Reliable Negative documents using Spy Documents
    2nd step: iterate EM with P, U-RN, and RN
    """

    print("Running Roc-EM")

    # step 1
    if verbose:
        print("Determining RN using Rocchio method\n")
    U_minus_RN, RN = get_RN_rocchio(P,
                                    U,
                                    alpha=alpha,
                                    beta=beta,
                                    verbose=verbose)

    # step2
    if verbose:
        print("\nIterating I-EM with P, U-RN, and RN")
    model = run_EM_with_RN(P,
                           U_minus_RN,
                           RN,
                           tolerance=tolerance,
                           max_pos_ratio=max_pos_ratio,
                           clf_selection=clf_selection,
                           verbose=verbose)

    if verbose:
        train_report(model, P, U)

    return model
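
Roc-EM differs from s_EM only in how the Reliable Negatives are found (Rocchio instead of spy documents); a sketch that turns off clf_selection, which is assumed from its name to control whether an internal model-selection step picks the returned classifier:

rocem_model = roc_EM(P, U,
                     alpha=16, beta=4,      # Rocchio weights for the positive / negative prototypes
                     clf_selection=False,   # assumed: return the final EM model without internal selection
                     verbose=True)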
Example #7
def roc_SVM(P,
            U,
            max_neg_ratio=0.1,
            alpha=16,
            beta=4,
            kernel=None,
            C=0.1,
            verbose=False):
    """Two-Step technique based on Rocchio and SVM

    Step 1: Find Reliable Negative docs using Rocchio (similarity to mean positive/unlabelled vector)
    Step 2: Iterate SVM starting from P and RN sets until classification of U converges"""

    print("Running Roc-SVM")

    # step 1
    if verbose:
        print("Determining RN using Rocchio method\n")
    U_minus_RN, RN = get_RN_rocchio(P,
                                    U,
                                    alpha=alpha,
                                    beta=beta,
                                    verbose=verbose)

    # step2
    if verbose:
        print("\nIterating SVM with P, U-RN, and RN")
    model = iterate_SVM(P,
                        U_minus_RN,
                        RN,
                        kernel=kernel,
                        C=C,
                        max_neg_ratio=max_neg_ratio,
                        verbose=verbose)

    if verbose:
        train_report(model, P, U)

    return model
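
A sketch that compares Roc-SVM decisions on U with the plain Rocchio baseline from example #3, assuming both returned models expose a scikit-learn-style predict (roc is the model from the standalone_rocchio sketch above):

rocsvm = roc_SVM(P, U, C=0.1, verbose=False)
agreement = (rocsvm.predict(U) == roc.predict(U)).mean()
print("agreement with standalone Rocchio on U:", agreement)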
Example #8
def spy_SVM(P,
            U,
            spy_ratio=0.1,
            max_neg_ratio=0.1,
            tolerance=0.1,
            noise_lvl=0.1,
            verbose=False):
    """S-EM two-step PU learning as described in \"Partially Supervised Classification...\".

    1st step: get Reliable Negative documents using Spy Documents
    2nd step: iterate EM with P, U-RN, and RN
    """

    print("Running Spy-SVM")

    # step 1
    if verbose:
        print(
            "Determining confidence threshold using Spy Documents and I-EM\n")
    U_minus_RN, RN = get_RN_Spy_Docs(P,
                                     U,
                                     spy_ratio=spy_ratio,
                                     tolerance=tolerance,
                                     noise_lvl=noise_lvl,
                                     verbose=verbose)

    # step2
    if verbose:
        print("\nIterating SVM with P, U-RN, and RN")
    model = iterate_SVM(P,
                        U_minus_RN,
                        RN,
                        max_neg_ratio=max_neg_ratio,
                        verbose=verbose)

    if verbose:
        train_report(model, P, U)

    return model
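
A sketch that ranks the unlabelled documents by their SVM decision value, assuming the model returned by iterate_SVM is LinearSVC-based and therefore exposes decision_function:

import numpy as np

spysvm = spy_SVM(P, U, spy_ratio=0.1, noise_lvl=0.1, verbose=False)
conf = spysvm.decision_function(U)   # signed distance to the separating hyperplane
top10 = np.argsort(-conf)[:10]       # indices of the 10 most positive-looking unlabelled docs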
Example #9
def biased_SVM_weight_selection(P,
                                U,
                                Cs_neg=None,
                                Cs_pos_factors=None,
                                Cs=None,
                                kernel='linear',
                                test_size=0.2,
                                verbose=False):
    """run biased SVMs with combinations of class weight values, choose the one with the best pu_measure"""

    # default values
    if Cs is None:
        Cs = [10**x for x in range(-12, 12, 2)]
    if Cs_neg is None:
        Cs_neg = [1]  # arange(0.01, 0.63, 0.04)
    if Cs_pos_factors is None:
        Cs_pos_factors = range(1, 1100, 200)

    Cs = [(C, C_neg * j, C_neg) for C in Cs for C_neg in Cs_neg
          for j in Cs_pos_factors]

    if verbose:
        print(
            "Running Biased-SVM with range of C and positive class weight factors.",
            num_rows(Cs), "parameter combinations.")

    P_train, P_test = train_test_split(P, test_size=test_size)
    U_train, U_test = train_test_split(U, test_size=test_size)
    X = concatenate((P_train, U_train))
    y = concatenate((ones(num_rows(P_train)), zeros(num_rows(U_train))))

    # with Pool(processes=min(cpu_count() - 1, num_rows(Cs))) as p:
    # map() returns a one-shot iterator in Python 3, so materialize it before reusing
    score_weights = list(
        map(
            partial(eval_params,
                    X_train=X,
                    y_train=y,
                    P_test=P_test,
                    U_test=U_test,
                    kernel=kernel), Cs))

    best_score_params = max(score_weights, key=lambda tup: tup[0])

    if verbose:
        for score_params in score_weights:
            print(score_params)
        print("\nBest model has parameters", best_score_params[1],
              "and PU-score", best_score_params[0])
        print("Building final classifier")

    model = build_biased_SVM(concatenate((P, U)),
                             concatenate(
                                 (ones(num_rows(P)), zeros(num_rows(U)))),
                             C_pos=best_score_params[1]['C_pos'],
                             C_neg=best_score_params[1]['C_neg'],
                             C=best_score_params[1]['C'],
                             probability=True,
                             kernel=kernel)

    if verbose:
        train_report(model, P, U)
    print("Returning Biased-SVM with parameters", best_score_params[1],
          "and PU-score", best_score_params[0])
    return model
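
A call sketch with a coarser search space; only the cross product of Cs, Cs_neg and Cs_pos_factors is evaluated, so trimming each list keeps the number of fitted SVMs small (the values are illustrative):

weighted_svm = biased_SVM_weight_selection(
    P, U,
    Cs=[0.1, 1, 10],
    Cs_neg=[1],
    Cs_pos_factors=[1, 10, 100],   # 3 * 1 * 3 = 9 weight combinations
    test_size=0.2,
    verbose=True)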