Example #1
    def feature_extraction(self, apk_paths, is_ordering=True):
        """
        feature extraction
        @param apk_paths: the list of applications
        @param is_ordering: return the list of features corresponds to the apk_paths
        """
        feature_save_dir = os.path.join("/tmp", "apk_data")

        if os.path.exists(feature_save_dir):
            # delete stale feature files from a previous run
            shutil.rmtree(feature_save_dir, ignore_errors=True)
            # a loose sanity check, disabled in favor of removing the directory:
            # file_number = len(os.listdir(feature_save_dir))
            # assert file_number == len(apk_paths), "Feature extraction halts: there are feature files in directory '{}'; please remove them if they are no longer needed".format(feature_save_dir)

        get_droid_feature(apk_paths,
                          feature_save_dir,
                          feature_type=self.feature_tp)
        feature_mapping = FeatureMapping(feature_save_dir,
                                         feature_type=self.feature_tp)
        if is_ordering:
            feature = feature_mapping.preprocess_feature(
                is_ordering, apk_paths)
        else:
            feature = feature_mapping.preprocess_feature()
        if not os.path.exists(
                config.get('feature.' + self.feature_tp, 'vocabulary')):
            logger.warning("No vocabulary.")
            return np.array([])
        vocab = utils.read_pickle(
            config.get('feature.' + self.feature_tp, 'vocabulary'))

        if self.feature_mp == 'count':
            return feature_mapping.count_feature_mapping_normalized(
                vocab, feature)
        else:
            return feature_mapping.binary_feature_mapping_normalized(
                vocab, feature)
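
The two mapping calls at the end are project-specific (`FeatureMapping` is not shown here), but the underlying idea is standard: project each app's extracted feature strings onto a fixed vocabulary as a binary indicator or count vector, then normalize (the `_normalized` suffix suggests some scaling; the per-row max scaling below is just one plausible choice). A minimal self-contained sketch of that idea; `map_features`, `vocab`, and `apps` are illustrative names, not part of the project API:

import numpy as np

def map_features(apps, vocab, binary=True):
    """Project feature strings onto a fixed vocabulary.

    apps  : list of lists of feature strings (one inner list per APK)
    vocab : ordered list of feature strings defining the columns
    Returns a (len(apps), len(vocab)) array, scaled into [0, 1] per row.
    """
    index = {word: j for j, word in enumerate(vocab)}
    X = np.zeros((len(apps), len(vocab)))
    for i, feats in enumerate(apps):
        for f in feats:
            j = index.get(f)
            if j is None:
                continue  # out-of-vocabulary features are dropped
            X[i, j] = 1. if binary else X[i, j] + 1.
    row_max = X.max(axis=1, keepdims=True)
    row_max[row_max == 0.] = 1.  # avoid dividing empty rows by zero
    return X / row_max

vocab = ["SEND_SMS", "INTERNET", "getDeviceId"]
apps = [["INTERNET", "getDeviceId", "getDeviceId"], ["SEND_SMS"]]
print(map_features(apps, vocab, binary=False))
# [[0.  0.5 1. ]
#  [1.  0.  0. ]]
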
Example #2
    def feature_extraction(self, apk_paths, inorder=True):
        feat_save_dir = os.path.join("/tmp", "apk_data")
        if os.path.exists(feat_save_dir):
            shutil.rmtree(feat_save_dir)
        get_droid_feature(apk_paths,
                          feat_save_dir,
                          feature_type=self.feature_tp)
        feature_mapping = FeatureMapping(feat_save_dir,
                                         feature_type=self.feature_tp)
        if inorder:
            feature = feature_mapping.preprocess_feature(inorder, apk_paths)
        else:
            feature = feature_mapping.preprocess_feature()
        if not os.path.exists(os.path.join(self.save_dir, 'vocabulary')):
            logger.info("No vocabulary.")
            return np.array([])
        vocab = utils.read_pickle(os.path.join(self.save_dir, 'vocabulary'))

        if self.feature_mp == 'count':
            return feature_mapping.count_feature_mapping_normalized(
                vocab, feature)
        else:
            return feature_mapping.binary_feature_mapping_normalized(
                vocab, feature)
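
Both versions guard the vocabulary load with `os.path.exists` and return an empty array when it is missing; they differ only in where the path comes from (a `config` lookup in version 1, `self.save_dir` in version 2). Assuming `utils.read_pickle` wraps the standard `pickle` module, the guard-and-load pattern looks like this; `vocab_path` and `load_vocabulary` are illustrative names:

import os
import pickle

def load_vocabulary(vocab_path):
    """Return the pickled vocabulary, or None if it was never saved."""
    if not os.path.exists(vocab_path):
        return None
    with open(vocab_path, 'rb') as f:
        return pickle.load(f)

# round trip: save once, load back
vocab_path = "/tmp/vocabulary.pkl"
with open(vocab_path, 'wb') as f:
    pickle.dump(["SEND_SMS", "INTERNET"], f)
print(load_vocabulary(vocab_path))  # ['SEND_SMS', 'INTERNET']
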
Example #3
    def _data_preprocess(self):
        """
        Feature extraction: build, select, and persist the dataset splits.
        """
        if (not os.path.exists(self.ben_dir)) or (not os.path.exists(
                self.mal_dir)):
            # both directories are required; the original `and` only fired
            # when both were missing, contradicting the error message
            logger.error("directory '{}' or '{}' has no APK data.".format(
                self.ben_dir, self.mal_dir))
            return
        try:
            label_dict = self.get_label_dict()

            data_root_dir = config.get("dataset", "dataset_root")
            feat_save_dir = os.path.join(data_root_dir, "apk_data")
            get_droid_feature(self.ben_dir,
                              feat_save_dir,
                              feature_type=self.feature_tp)
            get_droid_feature(self.mal_dir,
                              feat_save_dir,
                              feature_type=self.feature_tp)

            feature_mapping = FeatureMapping(feat_save_dir,
                                             feature_type=self.feature_tp)
            naive_features, name_list = feature_mapping.load_features()

            if len(naive_features) == 0:
                logger.error("No features extracted.")
                return

            # remove S6 (used permissions): these features depend on feature 'S7' (APIs) and feature 'S2' (permissions)
            if not self.info.use_interdependent_features:
                naive_features = feature_mapping.remove_interdependent_featrues(
                    naive_features)

            gt_label = np.array([label_dict[os.path.splitext(name.strip())[0]] \
                                 for name in name_list])

            vocab, vocab_info_dict, features = feature_mapping.generate_vocab(
                naive_features)

            # split features into training, validation, and test sets
            train_features, test_features, train_y, test_y, train_name_list, test_name_list = \
                train_test_split(features, gt_label, name_list, test_size=0.2, random_state=0)
            train_features, val_features, train_y, val_y, train_name_list, val_name_list = \
                train_test_split(train_features, train_y, train_name_list, test_size=0.25, random_state=0)

            # select frequent features
            vocab_selected, vocab_info_dict_selected = \
                feature_mapping.select_feature(train_features, train_y, vocab, vocab_info_dict, dim=10000)
            MSG = "After feature selection, the number of features is {} (down from {})".format(
                len(vocab_selected), len(vocab))
            logger.info(msg=MSG)

            if self.feature_mp == 'count':
                training_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, train_features, status='train')
                val_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, val_features)
                test_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, test_features)
            elif self.feature_mp == 'binary':
                training_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, train_features, status='train')
                val_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, val_features)
                test_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, test_features)
            else:
                raise ValueError("Not supported")

            # save features and feature representations
            utils.dump_pickle(
                vocab_selected,
                config.get('feature.' + self.feature_tp, 'vocabulary'))
            utils.dump_pickle(
                vocab_info_dict_selected,
                config.get('feature.' + self.feature_tp, 'vocab_info'))
            utils.dump_joblib([
                training_feature_vectors, val_feature_vectors,
                test_feature_vectors
            ], config.get('feature.' + self.feature_tp, 'dataX'))
            utils.dump_joblib([train_y, val_y, test_y],
                              config.get('feature.' + self.feature_tp,
                                         'datay'))

            utils.write_whole_file(
                '\n'.join(train_name_list + val_name_list + test_name_list),
                config.get('dataset', 'name_list'))
        except Exception as ex:
            logger.error(str(ex))
            sys.exit(1)
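
The two chained `train_test_split` calls implement a 60/20/20 split: the first holds out 20% for testing, and the second takes 25% of the remaining 80% (i.e., 20% of the whole) as validation. `train_test_split` accepts any number of aligned arrays, which is how the name list stays in sync with the features and labels. A self-contained illustration with synthetic data (shapes are made up for the demo):

import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(100, 5)              # 100 samples, 5 features
y = np.random.randint(0, 2, size=100)   # binary labels
names = np.array(["apk_%03d" % i for i in range(100)])

# 80% train+val vs. 20% test
X_tr, X_te, y_tr, y_te, n_tr, n_te = \
    train_test_split(X, y, names, test_size=0.2, random_state=0)
# 25% of the remaining 80% == 20% of the original data
X_tr, X_va, y_tr, y_va, n_tr, n_va = \
    train_test_split(X_tr, y_tr, n_tr, test_size=0.25, random_state=0)

print(len(X_tr), len(X_va), len(X_te))  # 60 20 20
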
Example #4
    def _data_preprocess(self):
        if (not os.path.exists(self.ben_dir)) or (not os.path.exists(
                self.mal_dir)):
            logger.error("directory '{}' or '{}' has no APK data.".format(
                self.ben_dir, self.mal_dir))
            return

        try:
            label_dict = self.get_label_dict()

            data_root_dir = config.get("dataset", "dataset_root")
            feat_save_dir = os.path.join(data_root_dir, "apk_data")
            get_droid_feature(self.ben_dir,
                              feat_save_dir,
                              feature_type=self.feature_tp)
            get_droid_feature(self.mal_dir,
                              feat_save_dir,
                              feature_type=self.feature_tp)

            feature_mapping = FeatureMapping(feat_save_dir,
                                             feature_type=self.feature_tp)
            naive_features, name_list = feature_mapping.load_features()
            gt_label = np.array([label_dict[os.path.splitext(name.strip())[0]] \
                                 for name in name_list])

            if len(naive_features) == 0:
                logger.error("No features extracted.")
                return

            if not self.info.use_interdependent_features:
                naive_features = feature_mapping.remove_interdependent_featrues(
                    naive_features)

            vocab, vocab_info_dict, feat_purified = feature_mapping.generate_vocab(
                naive_features)

            # split features into training, validation, and test sets
            train_features, test_features, train_y, test_y, train_name_list, test_name_list = \
                train_test_split(feat_purified, gt_label, name_list, test_size=0.2, random_state=0)
            train_features, val_features, train_y, val_y, train_name_list, val_name_list = \
                train_test_split(train_features, train_y, train_name_list, test_size=0.25, random_state=0)

            # select features
            vocab_selected, vocab_info_dict_selected = \
                feature_mapping.select_feature(train_features, train_y, vocab, vocab_info_dict, dim=10000)

            # feature preprocessing based on the feature utility rate
            if abs(self.feature_utility_rate - 1.) < 1e-10:
                pass  # rate of 1.0: keep all features as-is
            elif 0. < self.feature_utility_rate < 1.:
                # todo
                pass
            else:
                raise ValueError("feature_utility_rate must be in (0, 1].")

            if self.feature_mp == 'count':
                training_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, train_features, status='train')
                val_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, val_features)
                test_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, test_features)
            else:
                training_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, train_features, status='train')
                val_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, val_features)
                test_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, test_features)

            utils.dump_pickle(vocab_selected,
                              os.path.join(self.save_dir, 'vocabulary'))
            utils.dump_pickle(vocab_info_dict_selected,
                              os.path.join(self.save_dir, 'vocab_info'))
            utils.dump_joblib([
                training_feature_vectors, val_feature_vectors,
                test_feature_vectors
            ], os.path.join(self.save_dir, 'dataX'))
            utils.dump_joblib([train_y, val_y, test_y],
                              os.path.join(self.save_dir, 'datay'))
            utils.write_whole_file('\n'.join(name_list),
                                   os.path.join(self.save_dir, 'name_list'))
        except Exception as ex:
            # KeyError (a missing label) and any other failure were handled
            # identically, so a single handler suffices
            logger.error(str(ex))
            sys.exit(1)
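
`utils.dump_pickle` and `utils.dump_joblib` are project wrappers whose internals are not shown; assuming the latter delegates to the `joblib` library, persisting and restoring the three splits in one file works like this (the `/tmp/dataX` path mirrors the examples above and is illustrative):

import numpy as np
import joblib

train_X = np.random.rand(6, 4)
val_X = np.random.rand(2, 4)
test_X = np.random.rand(2, 4)

# persist all three splits in a single file, as the examples above do
joblib.dump([train_X, val_X, test_X], "/tmp/dataX")

restored_train, restored_val, restored_test = joblib.load("/tmp/dataX")
assert np.array_equal(train_X, restored_train)
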