import os
import shutil
import sys

import numpy as np
from sklearn.model_selection import train_test_split

# Project-local dependencies; the exact module paths are assumptions and may
# need adjusting to the repository layout:
# from droid_feature import get_droid_feature, FeatureMapping
# import config
# import utils
# logger = logging.getLogger(__name__)


def feature_extraction(self, apk_paths, is_ordering=True):
    """Extract feature vectors for a list of APKs.

    @param apk_paths: list of paths to the applications
    @param is_ordering: if True, return the features in the same order as
        apk_paths
    """
    feature_save_dir = os.path.join("/tmp", "apk_data")
    if os.path.exists(feature_save_dir):
        # Delete stale feature files from a previous run so the directory
        # only contains features for this batch of APKs.
        shutil.rmtree(feature_save_dir, ignore_errors=True)

    get_droid_feature(apk_paths, feature_save_dir, feature_type=self.feature_tp)
    feature_mapping = FeatureMapping(feature_save_dir, feature_type=self.feature_tp)
    if is_ordering:
        feature = feature_mapping.preprocess_feature(is_ordering, apk_paths)
    else:
        feature = feature_mapping.preprocess_feature()

    vocab_path = config.get('feature.' + self.feature_tp, 'vocabulary')
    if not os.path.exists(vocab_path):
        logger.warning("No vocabulary.")
        return np.array([])
    vocab = utils.read_pickle(vocab_path)

    if self.feature_mp == 'count':
        return feature_mapping.count_feature_mapping_normalized(vocab, feature)
    else:
        return feature_mapping.binary_feature_mapping_normalized(vocab, feature)
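# A minimal, self-contained sketch (not part of the pipeline) of the
# is_ordering contract above: per-APK feature files come back from disk in
# directory order, and is_ordering=True re-aligns rows with the caller's
# apk_paths order. Paths and stand-in feature rows are made up.
def _demo_is_ordering_contract():
    apk_paths = ["/apks/b.apk", "/apks/a.apk"]           # caller's order
    disk_order = sorted(apk_paths)                       # on-disk file order
    rows_by_path = {p: [len(p)] for p in disk_order}     # stand-in feature rows
    ordered_rows = [rows_by_path[p] for p in apk_paths]  # re-aligned to apk_paths
    assert ordered_rows[0] == rows_by_path["/apks/b.apk"]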
def feature_extraction(self, apk_paths, inorder=True):
    """Variant of feature_extraction that reads the vocabulary from
    self.save_dir rather than from the configured path."""
    feat_save_dir = os.path.join("/tmp", "apk_data")
    if os.path.exists(feat_save_dir):
        shutil.rmtree(feat_save_dir)

    get_droid_feature(apk_paths, feat_save_dir, feature_type=self.feature_tp)
    feature_mapping = FeatureMapping(feat_save_dir, feature_type=self.feature_tp)
    if inorder:
        feature = feature_mapping.preprocess_feature(inorder, apk_paths)
    else:
        feature = feature_mapping.preprocess_feature()

    vocab_path = os.path.join(self.save_dir, 'vocabulary')
    if not os.path.exists(vocab_path):
        logger.info("No vocabulary.")
        return np.array([])
    vocab = utils.read_pickle(vocab_path)

    if self.feature_mp == 'count':
        return feature_mapping.count_feature_mapping_normalized(vocab, feature)
    else:
        return feature_mapping.binary_feature_mapping_normalized(vocab, feature)
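# Sketch of the temp-directory reset both variants above rely on: stale
# per-APK feature files are removed so the extractor writes exactly one
# feature file per APK in this run. The path mirrors the hard-coded one.
def _demo_fresh_feature_dir():
    feature_save_dir = os.path.join("/tmp", "apk_data")
    shutil.rmtree(feature_save_dir, ignore_errors=True)  # no-op if absent
    os.makedirs(feature_save_dir, exist_ok=True)         # ready for the extractor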
def _data_preprocess(self):
    """Feature extraction and dataset construction: extracts features from
    the benign and malicious APK directories, builds a vocabulary, splits the
    data into train/validation/test sets, and dumps the representations."""
    if (not os.path.exists(self.ben_dir)) and (not os.path.exists(self.mal_dir)):
        logger.error("Directory '{}' or '{}' has no APK data.".format(
            self.ben_dir, self.mal_dir))
        return

    try:
        label_dict = self.get_label_dict()
        data_root_dir = config.get("dataset", "dataset_root")
        feat_save_dir = os.path.join(data_root_dir, "apk_data")
        get_droid_feature(self.ben_dir, feat_save_dir, feature_type=self.feature_tp)
        get_droid_feature(self.mal_dir, feat_save_dir, feature_type=self.feature_tp)

        feature_mapping = FeatureMapping(feat_save_dir, feature_type=self.feature_tp)
        naive_features, name_list = feature_mapping.load_features()
        if len(naive_features) == 0:
            logger.error("No features extracted.")
            return

        # Remove 'S6' (used permissions): these features depend on feature
        # 'S7' (APIs) and feature 'S2' (permissions).
        if not self.info.use_interdependent_features:
            naive_features = feature_mapping.remove_interdependent_featrues(
                naive_features)

        gt_label = np.array([label_dict[os.path.splitext(name.strip())[0]]
                             for name in name_list])
        vocab, vocab_info_dict, features = feature_mapping.generate_vocab(
            naive_features)

        # Split into training, validation, and testing sets (60/20/20).
        train_features, test_features, train_y, test_y, train_name_list, test_name_list = \
            train_test_split(features, gt_label, name_list,
                             test_size=0.2, random_state=0)
        train_features, val_features, train_y, val_y, train_name_list, val_name_list = \
            train_test_split(train_features, train_y, train_name_list,
                             test_size=0.25, random_state=0)

        # Select the most frequent features on the training split only.
        vocab_selected, vocab_info_dict_selcted = \
            feature_mapping.select_feature(train_features, train_y, vocab,
                                           vocab_info_dict, dim=10000)
        logger.info("After feature selection, the feature number is {} vs. {}".format(
            len(vocab_selected), len(vocab)))

        if self.feature_mp == 'count':
            training_feature_vectors = feature_mapping.count_feature_mapping_normalized(
                vocab_selected, train_features, status='train')
            val_feature_vectors = feature_mapping.count_feature_mapping_normalized(
                vocab_selected, val_features)
            test_feature_vectors = feature_mapping.count_feature_mapping_normalized(
                vocab_selected, test_features)
        elif self.feature_mp == 'binary':
            training_feature_vectors = feature_mapping.binary_feature_mapping_normalized(
                vocab_selected, train_features, status='train')
            val_feature_vectors = feature_mapping.binary_feature_mapping_normalized(
                vocab_selected, val_features)
            test_feature_vectors = feature_mapping.binary_feature_mapping_normalized(
                vocab_selected, test_features)
        else:
            raise ValueError("Unsupported feature mapping '{}'.".format(self.feature_mp))

        # Persist the vocabulary and the feature representations.
        utils.dump_pickle(vocab_selected,
                          config.get('feature.' + self.feature_tp, 'vocabulary'))
        utils.dump_pickle(vocab_info_dict_selcted,
                          config.get('feature.' + self.feature_tp, 'vocab_info'))
        utils.dump_joblib([training_feature_vectors, val_feature_vectors,
                           test_feature_vectors],
                          config.get('feature.' + self.feature_tp, 'dataX'))
        utils.dump_joblib([train_y, val_y, test_y],
                          config.get('feature.' + self.feature_tp, 'datay'))
        utils.write_whole_file(
            '\n'.join(train_name_list + val_name_list + test_name_list),
            config.get('dataset', 'name_list'))
    except Exception as ex:
        logger.error(str(ex))
        sys.exit(1)
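# Sketch showing why test_size=0.2 followed by test_size=0.25 above yields a
# 60/20/20 train/validation/test split: 0.25 of the remaining 80% is 20%.
# Toy arrays only; not the project's data.
def _demo_split_ratios():
    data = np.arange(100)
    labels = np.arange(100)
    tr, te, tr_y, te_y = train_test_split(data, labels, test_size=0.2, random_state=0)
    tr, va, tr_y, va_y = train_test_split(tr, tr_y, test_size=0.25, random_state=0)
    assert (len(tr), len(va), len(te)) == (60, 20, 20)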
def _data_preprocess(self):
    """Variant of _data_preprocess that persists artifacts under
    self.save_dir and gates preprocessing on self.feature_utility_rate."""
    if (not os.path.exists(self.ben_dir)) and (not os.path.exists(self.mal_dir)):
        logger.error("Directory '{}' or '{}' has no APK data.".format(
            self.ben_dir, self.mal_dir))
        return

    try:
        label_dict = self.get_label_dict()
        data_root_dir = config.get("dataset", "dataset_root")
        feat_save_dir = os.path.join(data_root_dir, "apk_data")
        get_droid_feature(self.ben_dir, feat_save_dir, feature_type=self.feature_tp)
        get_droid_feature(self.mal_dir, feat_save_dir, feature_type=self.feature_tp)

        feature_mapping = FeatureMapping(feat_save_dir, feature_type=self.feature_tp)
        naive_features, name_list = feature_mapping.load_features()
        gt_label = np.array([label_dict[os.path.splitext(name.strip())[0]]
                             for name in name_list])
        if len(naive_features) == 0:
            logger.error("No features extracted.")
            return

        if not self.info.use_interdependent_features:
            naive_features = feature_mapping.remove_interdependent_featrues(
                naive_features)

        vocab, vocab_info_dict, feat_purified = feature_mapping.generate_vocab(
            naive_features)

        # Split into training, validation, and testing sets (60/20/20).
        train_features, test_features, train_y, test_y, train_name_list, test_name_list = \
            train_test_split(feat_purified, gt_label, name_list,
                             test_size=0.2, random_state=0)
        train_features, val_features, train_y, val_y, train_name_list, val_name_list = \
            train_test_split(train_features, train_y, train_name_list,
                             test_size=0.25, random_state=0)

        # Select features on the training split only.
        vocab_selected, vocab_info_dict_selcted = \
            feature_mapping.select_feature(train_features, train_y, vocab,
                                           vocab_info_dict, dim=10000)

        # Feature preprocessing based on the feature utility rate.
        if abs(self.feature_utility_rate - 1.) < 1e-10:
            pass  # rate of 1.0: use all features as-is
        elif 0. < self.feature_utility_rate < 1.:
            # todo
            pass
        else:
            raise ValueError("feature_utility_rate must be in (0, 1].")

        if self.feature_mp == 'count':
            training_feature_vectors = feature_mapping.count_feature_mapping_normalized(
                vocab_selected, train_features, status='train')
            val_feature_vectors = feature_mapping.count_feature_mapping_normalized(
                vocab_selected, val_features)
            test_feature_vectors = feature_mapping.count_feature_mapping_normalized(
                vocab_selected, test_features)
        else:
            training_feature_vectors = feature_mapping.binary_feature_mapping_normalized(
                vocab_selected, train_features, status='train')
            val_feature_vectors = feature_mapping.binary_feature_mapping_normalized(
                vocab_selected, val_features)
            test_feature_vectors = feature_mapping.binary_feature_mapping_normalized(
                vocab_selected, test_features)

        # Persist the vocabulary and feature representations under self.save_dir.
        utils.dump_pickle(vocab_selected,
                          os.path.join(self.save_dir, 'vocabulary'))
        utils.dump_pickle(vocab_info_dict_selcted,
                          os.path.join(self.save_dir, 'vocab_info'))
        utils.dump_joblib([training_feature_vectors, val_feature_vectors,
                           test_feature_vectors],
                          os.path.join(self.save_dir, 'dataX'))
        utils.dump_joblib([train_y, val_y, test_y],
                          os.path.join(self.save_dir, 'datay'))
        utils.write_whole_file('\n'.join(name_list),
                               os.path.join(self.save_dir, 'name_list'))
    except Exception as ex:
        # KeyError (e.g., a missing label) and any other failure were handled
        # identically, so a single handler suffices.
        logger.error(str(ex))
        sys.exit(1)
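# Illustration of the 'binary' vs. 'count' branches above, with a toy
# vocabulary and token list. This is a guess at the mapping semantics, not
# the project's FeatureMapping implementation, and it omits the
# normalization step the real methods apply.
def _demo_binary_vs_count_mapping():
    vocab = ["SEND_SMS", "READ_CONTACTS", "SmsManager->sendTextMessage"]
    tokens = ["SEND_SMS", "SEND_SMS", "READ_CONTACTS"]  # tokens from one APK
    counts = np.array([tokens.count(v) for v in vocab], dtype=float)
    binary = (counts > 0).astype(float)
    assert counts.tolist() == [2.0, 1.0, 0.0]
    assert binary.tolist() == [1.0, 1.0, 0.0]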