Example #1
def main():
    f_feature_lists = Path("feature_lists.pickle")
    f_app_names = Path("app_names.pickle")
    if f_feature_lists.exists() and f_app_names.exists():
        logger.info("Found pickle files for feature_lists and app_names")
        with f_feature_lists.open(mode="rb") as f:
            feature_lists = pickle.load(f)
        with f_app_names.open(mode="rb") as f:
            app_names = pickle.load(f)
    else:
        app_names, feature_lists = get_features(config.get("dataset", "path"))
        with f_feature_lists.open(mode="wb") as f:
            pickle.dump(feature_lists, f, pickle.HIGHEST_PROTOCOL)
        with f_app_names.open(mode="wb") as f:
            pickle.dump(app_names, f, pickle.HIGHEST_PROTOCOL)

    features, labels, feature_names = vectorize_features(feature_lists,
                                                         label=config.get(
                                                             "dataset",
                                                             "label"))

    sparse.save_npz("features.npz", features)
    sparse.save_npz("labels.npz", labels)
    with Path("feature_names.pickle").open(mode="wb") as f:
        pickle.dump(feature_names, f, pickle.HIGHEST_PROTOCOL)
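The module-level `config` used above is not shown; a minimal sketch, assuming it is a standard `configparser.ConfigParser` with a [dataset] section (the section and option names come from the `config.get` calls above, the values are placeholders):

import configparser

# Hypothetical setup matching config.get("dataset", "path") and
# config.get("dataset", "label"); the values are placeholders.
config = configparser.ConfigParser()
config["dataset"] = {
    "path": "/path/to/apps",
    "label": "malware",
}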
Example #2
def compare_features():
    """
    Generate files with chi2 and mutual information scores for every feature.

    These files can be read using pd.read_csv to see the values and select the best K features. This is not automated.
    """

    selectors = [
        SelectKBest(chi2, k=20),
        SelectKBest(mutual_info_classif, k=20),
    ]

    feature_names = load_feature_names(FEATURE_NAMES_PICKLE)
    all_features = sparse.load_npz(FEATURES_NPZ)
    labels = sparse.load_npz(LABELS_NPZ)
    labels = labels.toarray().ravel()
    labels[labels != 1] = -1
    X_train, X_test, y_train, y_test = train_test_split(all_features,
                                                        labels,
                                                        test_size=0.5,
                                                        stratify=labels)

    for selector in selectors:
        selector_name = selector.score_func.__name__
        logger.info(f"Applying selector {selector_name}")
        selector.fit(X_train, y_train)

        df = pd.DataFrame.from_dict({
            "name": feature_names,
            "score": selector.scores_,
            "pvalue": selector.pvalues_
        })
        df.to_csv(f"{selector_name}_features.csv")
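As the docstring notes, the selection step is manual; a minimal sketch of how the generated scores might be inspected, assuming the chi2 run (the file name and column names follow the CSV written above, K = 20 is arbitrary):

import pandas as pd

# Read the scores written by compare_features() and keep the K
# highest-scoring feature names for later use.
scores = pd.read_csv("chi2_features.csv", index_col=0)
best = scores.sort_values("score", ascending=False).head(20)
print(best[["name", "score"]])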
Example #3
    def select_malicious(self, baseline_model, base_models, weights):
        y = 1
        self.y_malicious = y
        malware = self.client_y_test == y
        X = self.client_X_test[malware, :]
        N = np.array(list(itertools.compress(self.client_N_test, malware)))

        permutation = np.random.permutation(X.shape[0])
        X = X[permutation, :]
        N = N[permutation]

        baseline_pred, candidate_pred = safew_base_predictions(
            baseline_model=baseline_model, base_models=base_models, X=X)

        def close_to_clean(sample, candidates):
            allies = candidates == -1
            # logger.info(f"Sum: {weights[allies].sum()}")
            return 0.3 <= weights[allies].sum() <= 0.4

        for i, sample in enumerate(X):
            candidates = candidate_pred[i, :]
            if close_to_clean(sample, candidates):
                allies = candidates == -1
                logger.info(
                    f"Found malicious sample: allied sum = {weights[allies].sum()}"
                )
                self.X_malicious = X[i, :].reshape(1, -1)  # Only one instance
                self.N_malicious = N[i]
                return
Example #4
    def select_features(self, X_train, X_test, X_preinstalled):
        selector = self.feature_selector

        new_X_train = selector.transform(X_train)
        new_X_test = selector.transform(X_test)
        new_X_preinstalled = selector.transform(X_preinstalled)
        logger.info(
            f"From {X_train.shape[1]} to {new_X_train.shape[1]} dimensions using {selector}"
        )

        return new_X_train, new_X_test, new_X_preinstalled
Example #5
def main(path="results"):
    for f in Path(path).glob("**/results.csv"):
        df = pd.read_csv(f)
        fps = sum(df["membership_fp"] > 0)
        if fps > 0:
            logger.info(
                f"There are {fps} false positives in the membership inference attack"
            )

        # assumes a membership_tp column analogous to membership_fp
        tps = sum(df["membership_tp"] > 0)
        if tps > 0:
            logger.info(
                f"There are {tps} true positives in the membership inference attack"
            )
Example #6
def test_all(models):
    X = lim_data.cloud_X_test
    y_true = lim_data.cloud_y_test

    for classifier in models:
        model = models[classifier]
        y_pred = model.predict(X)
        logger.info(scores(classifier, y_true, y_pred))

        baseline = classifier
        y_pred, weights = safew(
            baseline_model=models[baseline],
            base_models=[
                models[name] for name in models if name != baseline
            ],
            X=X,
        )

        logger.info(scores(f"SAFEW baseline {baseline}", y_true, y_pred))
Example #7
def categories_table(selected_features, n_features=(100, 200, 500)):
    category_features = get_features_in_categories()
    counts_dict = {}

    for n in n_features:
        row_counts = {category: 0 for category in category_features}
        for feature in selected_features[:n]:
            for category in category_features:
                if feature in category_features[category]:
                    row_counts[category] += 1
            occurrences = sum(
                [feature in category_features[c] for c in category_features])
            categories = [
                c for c in category_features if feature in category_features[c]
            ]
            if occurrences > 1:
                logger.info(
                    f"Feature {feature} is in {occurrences} categories: {categories}"
                )
        counts_dict[n] = row_counts

    df = pd.DataFrame(data=counts_dict)
    return df
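A minimal usage sketch, assuming `selected_features` is a list of feature names ordered by decreasing importance (for instance the names ranked in the chi2 CSV above) and that `get_features_in_categories()` maps each category name to a collection of feature names:

import pandas as pd

# Hypothetical call: rank feature names by chi2 score and count how many
# of the top 100/200/500 fall into each category.
ranked = pd.read_csv("chi2_features.csv", index_col=0).sort_values(
    "score", ascending=False)["name"].tolist()
table = categories_table(ranked, n_features=[100, 200, 500])
print(table.to_string())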
Example #8
    def run_federation(self):
        logger.info(f"Round {self.federation_round}")
        self.create_cloud()
        if self.adversarial_proportion > 0:
            # Simulate a blank slate adversarial client
            client = self.create_client_worker([0, True])
            self.data.select_malicious(baseline_model=self.baseline_model,
                                       base_models=self.base_models,
                                       weights=client.weights)

        # Let adversarial clients find a malicious app in later rounds
        self.create_clients()

        while self.federation_round < self.n_rounds:
            self.federation_round += 1
            logger.info(f"Round {self.federation_round}")
            self.federate_cloud()
            self.federate_clients()
            df_tmp = results.to_df()
            df_tmp.to_csv("results_tmp.csv")
            evaluate_lim.plot_cloud(df_tmp)
            evaluate_lim.plot_clients(df_tmp)

        return results.to_df()
Example #9
    def initialize(self):
        self.all_features = sparse.load_npz(self.features_npz)
        normal_apps_ind, preinstalled_ind = self.normal_preinstalled_ind(
            self.all_features)

        with Path(self.app_names_pickle).open(mode="rb") as f:
            app_names = pickle.load(f)
        self.N = app_names

        self.labels = self.labels()
        logger.info(f"Total clean apps = {np.sum(self.labels == -1)}")
        logger.info(f"Total malware apps = {np.sum(self.labels == 1)}")
        X_preinstalled = self.all_features.tocsr()[preinstalled_ind, :]
        self.y_preinstalled = self.labels[preinstalled_ind]

        self.N_preinstalled = list(itertools.compress(self.N,
                                                      preinstalled_ind))

        logger.info(f"random state = {self.random_state}")
        X_train, X_test, self.y_train, self.y_test, self.N_train, self.N_test = train_test_split(
            self.all_features,
            self.labels,
            self.N,
            test_size=self.unlabeled_data_proportion,
            random_state=self.random_state)
        self.feature_selector = self.fit_selector()
        self.X_train, self.X_test, self.X_preinstalled = self.select_features(
            X_train,
            X_test,
            X_preinstalled,
        )
        self.client_X_test, cloud_X_test, self.client_y_test, cloud_y_test, self.client_N_test, cloud_N_test = train_test_split(
            self.X_test,
            self.y_test,
            self.N_test,
            test_size=self.client_unlabeled_proportion,
            random_state=self.random_state,
        )

        def share_testing_examples(X, y, N, n):
            # Draw n random testing examples (with replacement) to share with the cloud.
            max_index = X.shape[0]
            indices = np.random.randint(max_index, size=n)

            X_shared = X[indices, :]
            y_shared = y[indices]
            N_shared = [N[i] for i in indices]
            return X_shared, y_shared, N_shared

        X_client, y_client, N_client = share_testing_examples(
            self.client_X_test, self.client_y_test, self.client_N_test, n=1000)
        self.cloud_X_test = sparse.vstack((cloud_X_test, X_client))
        self.cloud_y_test = np.concatenate((cloud_y_test, y_client))
        self.cloud_N_test = cloud_N_test + N_client
Example #10
def experiment(
    name,
    baseline_model=BASELINE_MODEL,
    base_models=BASE_MODELS,
    top_k=50,
    top_k_features=20,
    n_rounds=50,
    n_clients=500,
    p_install=0.6,
    p_malware=0.1,
    unlabeled_data_proportion=0.8,
    client_unlabeled_proportion=0.8,
    k_best_features=100,
    adversarial_proportion=0.5,
    n_max_apps_per_round=5,
):

    logger.info(f"Name of the experiment: {name}")
    logger.info(f"Baseline model: {baseline_model}")
    logger.info(f"Base models: {base_models}")
    logger.info(f"K best features: {k_best_features}")
    logger.info(f"Number of popular apps: {top_k}")
    logger.info(f"Number of rounds: {n_rounds}")
    logger.info(f"Number of clients: {n_clients}")
    logger.info(f"Install an app with probability {p_install}")
    logger.info(f"Install a malware app with probability {p_malware}")
    logger.info(f"Top k features: {top_k_features}")
    logger.info(f"Proportion of testing data: {unlabeled_data_proportion}")
    logger.info(
        f"Proportion of testing data for clients: {client_unlabeled_proportion}"
    )

    feature_selector = feature_selection.SelectKBest(feature_selection.chi2,
                                                     k=k_best_features)
    lim_data = LiMData(unlabeled_data_proportion=unlabeled_data_proportion,
                       client_unlabeled_proportion=client_unlabeled_proportion,
                       top_k=top_k,
                       top_k_features=top_k_features,
                       feature_selector=feature_selector,
                       random_state=42).get()
    lim = LiM(
        data=lim_data,
        baseline_model=baseline_model,
        base_models=base_models,
        n_rounds=n_rounds,
        n_clients=n_clients,
        p_install=p_install,
        p_malware=p_malware,
        adversarial_proportion=adversarial_proportion,
        n_max_apps_per_round=n_max_apps_per_round,
    )
    df = lim.run_federation()
    df.to_csv("results.csv")

    report = simple_report(df)
    with pathlib.Path("report.txt").open("w") as f:
        f.write(report)
    logger.info(report)

    plot_cloud(df)
    plot_clients(df)

    move_results(name)
    results.clear()
    gc.collect()
Example #11
def simple_report(df):
    errors = []
    Line = namedtuple(
        "Line", ["error", "place", "classifier", "average", "std", "total"])
    df["precision"] = df.tp / (df.tp + df.fp)
    df["recall"] = df.tp / (df.tp + df.fn)

    for place in ["client", "cloud"]:
        place_df = df[df.place == place]
        for classifier in df.classifier.unique():
            classifier_df = place_df[place_df.classifier == classifier]
            line = Line(
                error="False positives",
                place=place,
                classifier=classifier,
                # weights=classifier_df.weights,
                average=classifier_df.groupby(
                    by="federation_round").fp.mean().mean(),
                std=classifier_df.groupby(
                    by="federation_round").fp.std().mean(),
                total=classifier_df.fp.mean(),
            )
            errors.append(line)

            with_malware = classifier_df[classifier_df.tp +
                                         classifier_df.fn > 0]
            if with_malware.shape[0] > 0:
                # line = Line(error="Poisoned",
                #             place=place,
                #             classifier=classifier,
                #             average=classifier_df[["federation_round", "poisoned"]].astype(int).groupby(by="federation_round").poisoned.sum().mean(),
                #             std=classifier_df.groupby(by="federation_round").poisoned.sum().std(),
                #             total=classifier_df.poisoned.astype(int).sum().mean(),)
                # errors.append(line)

                line = Line(
                    error="Recall",
                    place=place,
                    classifier=classifier,
                    # weights=classifier_df.weights,
                    average=with_malware.groupby(
                        by="federation_round").recall.mean().mean(),
                    std=with_malware.groupby(
                        by="federation_round").recall.std().mean(),
                    total=(with_malware.tp + with_malware.fp).mean(),
                )
                errors.append(line)

                line = Line(
                    error="Overall performance (F1 score)",
                    place=place,
                    classifier=classifier,
                    # weights=classifier_df.weights,
                    average=with_malware.groupby(
                        by="federation_round").f1.mean().mean(),
                    std=with_malware.groupby(
                        by="federation_round").f1.std().mean(),
                    total=(with_malware.tp + with_malware.fn +
                           with_malware.fp + with_malware.tn).mean(),
                )
                errors.append(line)
    report = pd.DataFrame(data=errors)
    for error in report.error.unique():
        logger.info(f"{error}\n{report[report.error == error]}")

    return report.to_string()
Example #12
    def get(self):
        if not self.initialized():
            logger.info("Initializing the LiM data")
            self.initialize()
        return self