def default_friendly_and_malware():
    """Vectorise the bundled malware APK samples against the API-22 PScout data.

    Resolves the ``friendly_apk`` and ``malware_apk`` data directories and the
    parsed API-22 PScout CSV, then builds feature vectors for the malware
    folder. The friendly-sample pass is currently disabled, and the computed
    vectors are neither returned nor written anywhere by this function.
    """
    benign_dir = get_data_directory("friendly_apk")
    malicious_dir = get_data_directory("malware_apk")
    pscout_csv = get_data_directory(
        "training_data", "API_22", "API_22_parsed_api.csv")

    # Disabled: the same call for benign_dir with is_malware=False.
    malware = make_apk_vector_folder(
        malicious_dir,
        pscout_csv,
        extract_manifest_file=True,
        is_malware=True,
    )
def parse_pscout_output(filename, api_lvl="API_22"):
    """Convert a raw PScout dump into ``<api_lvl>_parsed_api.csv``.

    The input is split into sections, each starting at a line that begins
    with ``Permission:``. For every section the lines from two past the
    header up to the next header are handed to ``get_list_of_apis``; each
    returned row is prefixed with the permission's short name (the text after
    the last ``.`` on the header line) and written to the CSV.

    Args:
        filename: Path of the raw PScout output file.
        api_lvl: Name of the API-level sub-directory under ``training_data``;
            also used as the output file-name prefix.
    """
    output_location = get_data_directory("training_data", api_lvl)
    # makedirs(exist_ok=True) also creates missing parents and does not race
    # with another process creating the directory between check and create.
    os.makedirs(output_location, exist_ok=True)
    output_file = os.path.join(output_location, api_lvl + "_parsed_api.csv")

    # Read the input once; the context manager closes it before any writing.
    with open(filename) as pscout_input:
        lines = pscout_input.read().split("\n")

    section_starts = [
        line_num for line_num, line in enumerate(lines)
        if line.startswith("Permission:")
    ]
    # Sentinel so the last section extends to the end of the file.
    section_starts.append(len(lines))

    results = []
    for start, end in zip(section_starts, section_starts[1:]):
        permission = lines[start].split(".")[-1]
        # start + 2 skips the header and the line right after it
        # (presumably the "N Callers:" summary -- confirm against PScout output).
        for api_row in get_list_of_apis(lines[start + 2:end]):
            results.append([permission] + api_row)

    # Open the output only after parsing succeeded so a failure cannot leave a
    # truncated CSV behind; newline="" is required by the csv module so the
    # writer controls line endings itself.
    with open(output_file, "w", newline="") as output:
        out_writer = csv.writer(output)
        out_writer.writerow(
            ["Permission", "Function Name", "Return Value", "Arguments"])
        out_writer.writerows(results)
def main():
    """Classify one APK from the command line and print the verdict.

    Reads ``input_path``, ``API_level``, ``retrain``, ``feature`` and
    ``model_name`` from ``parse_arguments()``, vectorises the APK, optionally
    retrains the model, then loads the matching pickled classifier and prints
    its prediction.
    """
    argument = parse_arguments()
    apk_path = argument["input_path"]
    api_lvl = argument["API_level"]
    model_name = argument["model_name"]

    pscout_in = get_data_directory(
        "training_data",
        "API_{}".format(api_lvl),
        "API_{}_parsed_api.csv".format(api_lvl))
    vector = make_apk_vector(apk_path, pscout_in, extract_manifest_file=True)

    if argument["retrain"]:
        # Retrain the model that is loaded below. Previously the API level and
        # model name were left at training_model's defaults, so a request for
        # another level/model retrained the wrong pickle. training_model names
        # its pickle by joining the words of model_name with "_", hence the
        # reverse mapping here.
        training_model(
            API_level=api_lvl,
            feature_selection=argument["feature"],
            model_name=model_name.replace("_", " "))

    classifier = get_classifier(API_lvl=api_lvl, classfier_type=model_name)
    # The last vector element is dropped before prediction
    # (presumably the is_malware label slot -- confirm in make_apk_vector).
    result = make_prediction(classifier, vector[:-1])

    print("\n---------------Results for {:40}---------------\n".format(
        os.path.basename(apk_path)))
    print("{:20} : {:.4f} (0=NOT malware, 1=malware)".format(
        "Prediction Result", result["Prediction Result"][0]))
    print("{:20} : {:.4f} (Combined with the result above)".format(
        "Actual Value", result["Confidential Interval"][0]))
def encode(self):
    """Pack the APIs this object uses into integers, four per permission.

    Every permission in the PScout mapping gets 256 flag characters. A flag is
    set to "1" when a used function appears in that permission's PScout list,
    at the function's index in the list; functions that are absent, or whose
    index is 256 or more, are skipped. For each name in ``permission_list.txt``
    (the last newline-split entry is dropped) the flags are cut into four
    64-character groups and each group is parsed as a base-2 integer. Names
    missing from the PScout mapping contribute four zeros.

    Returns:
        A flat list of integers, four per listed permission.
    """
    list_path = get_data_directory("permission_metadata", "permission_list.txt")
    with open(list_path) as permissions:
        flags_by_permission = {
            name: ["0"] * 256 for name in self.__pscout_readable_results
        }

        for used_function in self.__function_used:
            for name in self.__pscout_readable_results:
                try:
                    position = self.__pscout_readable_results[name].index(used_function)
                    flags_by_permission[name][position] = "1"
                except (ValueError, IndexError):
                    # Not mapped to this permission, or beyond the 256 flags.
                    continue

        listed_names = permissions.read().split("\n")[:-1]
        result_vector = []
        for permission in listed_names:
            if permission in flags_by_permission:
                flags = flags_by_permission[permission]
                result_vector.extend(
                    int("".join(flags[offset:offset + 64]), base=2)
                    for offset in range(0, 256, 64)
                )
            else:
                result_vector.extend([0, 0, 0, 0])
    return result_vector
def get_classifier(API_lvl=22, classfier_type="Random_Forest"):
    """Load the pickled classifier stored for an API level and model type.

    Args:
        API_lvl: API level; selects the ``API_<level>`` training directory.
        classfier_type: Model suffix of the pickle file name
            (``API_<level>_<classfier_type>``).

    Returns:
        The unpickled classifier object.

    Raises:
        AssertionError: If the pickle file does not exist.
    """
    level_dir = "API_{}".format(API_lvl)
    pickle_name = "API_{}_{}".format(API_lvl, classfier_type)
    model_path = get_data_directory("training_data", level_dir, pickle_name)

    assert os.path.exists(model_path), (
        "Classifier does not exist. Do you want to train a new one?")

    # NOTE: unpickling executes code from the file; only load trusted models.
    with open(model_path, "rb") as model_file:
        return pickle.load(model_file)
def training_model(API_level=22, dump_model=True, load_model=False,
                   model_location=None, full_train=False,
                   feature_selection="All", model_name="Random Forest",
                   **model_param):
    """Train (or reload and refit) a classifier on the stored training matrix.

    Loads ``API_<level>_training_final.txt`` with ``np.loadtxt``; the last
    column is the label. Prints ten-fold cross-validation accuracy, fits the
    model, and optionally draws a ROC curve and pickles the result.

    Args:
        API_level: API level selecting the training directory and file names.
        dump_model: When true, pickle the fitted model to
            ``API_<level>_<model name with spaces replaced by "_">``.
        load_model: When true, start from the pickle at ``model_location``
            instead of constructing a new estimator.
        model_location: Path of the pickle to load; required with ``load_model``.
        full_train: When true, fit on all rows and skip the 10% hold-out and
            the ROC curve.
        feature_selection: ``"All"`` (every column but the label),
            ``"Permission Only"`` (columns 0-149) or ``"API Only"``
            (columns 150 up to the label).
        model_name: ``"Random Forest"`` or ``"Neural Network"``; also used in
            the report and the pickle name.
        **model_param: Passed to the estimator constructor.

    Returns:
        The fitted classifier.

    Raises:
        AssertionError: ``load_model`` is set without ``model_location``.
        ValueError: Unsupported ``feature_selection``, or unsupported
            ``model_name`` when a new estimator has to be constructed.
    """
    if load_model:
        assert not (model_location is None), "You have to tell where the <pickled> model is"

    # Reject unknown options up front. Previously these fell through the
    # if/elif chains and surfaced later as an unbound-variable error.
    if feature_selection not in ("All", "Permission Only", "API Only"):
        raise ValueError(
            "Unsupported feature_selection: {!r}".format(feature_selection))
    if not load_model and model_name not in ("Random Forest", "Neural Network"):
        raise ValueError("Unsupported model_name: {!r}".format(model_name))

    data_path = get_data_directory(
        "training_data",
        "API_{}".format(API_level),
        "API_{}_training_final.txt".format(API_level))
    huge_data = np.loadtxt(data_path)

    if feature_selection == "All":
        X = huge_data[:, :-1]
    elif feature_selection == "Permission Only":
        X = huge_data[:, :150]
    else:  # "API Only"
        X = huge_data[:, 150:-1]
    y = huge_data[:, -1]

    if not full_train:
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, y, shuffle=True, train_size=0.9)
    else:
        X_train, X_test, Y_train, Y_test = X, None, y, None

    # NOTE(review): KFold(n, n_folds=...) is the legacy
    # sklearn.cross_validation signature -- confirm which KFold the file imports.
    k_fold = KFold(len(Y_train), n_folds=10, shuffle=True, random_state=0)

    if load_model:
        # NOTE: unpickling executes code from the file; only load trusted models.
        with open(model_location, "rb") as pickle_can:
            clf = pickle.load(pickle_can)
    elif model_name == "Random Forest":
        clf = RandomForestClassifier(**model_param)
    else:  # "Neural Network"
        clf = MLPClassifier(**model_param)

    score_list = cross_val_score(
        clf, X_train, Y_train, cv=k_fold, n_jobs=1, scoring="accuracy")
    print("[Model Name] ", model_name)
    print("[Trained with] {:5d} samples".format(len(X_train)))
    print("Ten Fold Cross Validation Accuracy : {:.2%} (+/-{:.2%})".format(
        np.mean(score_list), np.std(score_list)))

    clf.fit(X_train, Y_train)
    if not full_train:
        generate_roc_curve(clf, X_test, Y_test)

    if dump_model:
        pickle_to = get_data_directory(
            "training_data",
            "API_{}".format(API_level),
            "API_{}_{}".format(API_level, "_".join(model_name.split())))
        with open(pickle_to, "wb") as pickle_can:
            pickle.dump(clf, pickle_can)
    return clf
def load_dangerous_permissions():
    """Return the whitespace-separated entries of ``dangerous_permission_list.txt``."""
    source_path = get_data_directory(
        "permission_metadata", "dangerous_permission_list.txt")
    with open(source_path) as handle:
        contents = handle.read()
    return contents.split()
def load_list_of_permissions():
    """Return the whitespace-separated entries of ``permission_list.txt``."""
    source_path = get_data_directory(
        "permission_metadata", "permission_list.txt")
    with open(source_path) as handle:
        contents = handle.read()
    return contents.split()
sample for dumpy friendly and malware ''' friendly_apks = get_data_directory("friendly_apk") malware_apks = get_data_directory("malware_apk") pscout_in = get_data_directory("training_data", "API_22", "API_22_parsed_api.csv") # friendly = make_apk_vector_folder(friendly_apks, pscout_in, extract_manifest_file=True, is_malware=False) malware = make_apk_vector_folder(malware_apks, pscout_in, extract_manifest_file=True, is_malware=True) # dump_to = get_data_directory("training_data", "API_22", "API_22_training.txt") # np.savetxt(dump_to, friendly + malware) if __name__ == "__main__": pscout_in = get_data_directory("training_data", "API_22", "API_22_parsed_api.csv") r = make_apk_vector(sys.argv[1], pscout_in, extract_manifest_file=True, is_malware=False) print(" ".join([str(num) for num in r])) # gc.set_debug(gc.DEBUG_LEAK) # make_apk_vector_folder("/Users/jeromemao/Desktop/EECS600/project/data/friendly_apk", "")
def main():
    """Run the full malware-analysis report for one APK given on the command line.

    Stages, in order: print MD5 and SHA1 digests, look the MD5 up with
    ``check``, run ``apkperm.pl`` and ``apkapi.pl`` on the APK, then vectorise
    the APK, optionally retrain, load the classifier and print its verdict.
    ``s(...)`` calls between stages are kept as in the original
    (presumably a sleep alias -- confirm at the import site).
    """
    # Local imports keep this block self-contained.
    import shutil
    import subprocess

    # NOTE(review): credentials are hard-coded and the connection is not used
    # anywhere in this function; consider moving them to configuration.
    # Keyword arguments are used because newer PyMySQL releases no longer
    # accept these positionally.
    db = pymysql.connect(
        host="localhost", user="mayank", password="mayank@25",
        database="Majorproject")
    try:
        print("\n*******************************************************************************************************************************************************")
        print("*************************************************************** Malware Analysis Tool ***************************************************************")
        print("*******************************************************************************************************************************************************")

        argument = parse_arguments()
        apk_path = argument["input_path"]
        api_lvl = argument["API_level"]
        model_name = argument["model_name"]

        # Read the APK once, with the handle closed deterministically, and
        # derive both digests from the same bytes.
        with open(apk_path, 'rb') as apk_file:
            apk_bytes = apk_file.read()

        print("\n\nCalculating MD5 Signature.......")
        s(2)
        md5 = hashlib.md5(apk_bytes).hexdigest()
        print(md5)

        print("\nCalculating SHA1 Signature.......")
        s(2)
        # Same output as the former `cat <path> | shasum | tr -d ' -'`, without
        # passing the user-supplied path through a shell.
        print(hashlib.sha1(apk_bytes).hexdigest())

        print("\n*******************************************************************************************************************************************************")
        print("********************************************************* Performing Signature Based Analysis *******************************************************")
        print("*******************************************************************************************************************************************************\n\n")
        s(7)
        sigMalware = check(md5)
        if sigMalware == 0:
            print("Matching Signatures found in Database")
            print("Not a Malware")
        elif sigMalware == 1:
            print("Matching Signature found in Database")
            print("Definitely Malware")
        else:
            print("No Matching Signatures found in Database")
            print("Signature Based Analysis Failed\n\n")
        s(4)

        print("\n\n\n*******************************************************************************************************************************************************")
        print("********************************************************* Performing Permission Based Analysis ******************************************************")
        print("*******************************************************************************************************************************************************")
        print("\n\nGathering Permissions from AndroidManifest.xml for Analysis")
        print("Displaying Results")
        s(5)
        # Argument list instead of a shell string: the path cannot be
        # interpreted as shell syntax.
        subprocess.run(["perl", "apkperm.pl", apk_path], check=False)

        print("\n*******************************************************************************************************************************************************")
        print("********************************************************* Performing API Calls Based Analysis *******************************************************")
        print("*******************************************************************************************************************************************************")
        print("\n\nGathering API Calls made by the Application for Analysis")
        print("Displaying Results\n")
        # Remove the directory named after the APK minus its extension
        # (presumably a previous extraction -- confirm against apkapi.pl).
        # splitext only strips the final extension, so a dot elsewhere in the
        # path can no longer redirect the delete to an unrelated directory;
        # the guard protects an extension-less APK path from deleting itself.
        extracted_dir = os.path.splitext(apk_path)[0]
        if extracted_dir and extracted_dir != apk_path:
            shutil.rmtree(extracted_dir, ignore_errors=True)
        s(5)
        subprocess.run(["perl", "apkapi.pl", apk_path], check=False)

        pscout_in = get_data_directory(
            "training_data",
            "API_{}".format(api_lvl),
            "API_{}_parsed_api.csv".format(api_lvl))
        vector = make_apk_vector(apk_path, pscout_in, extract_manifest_file=True)
        if argument["retrain"]:
            # Retrain the model that is loaded below, not training_model's
            # defaults; its pickle name joins the words of model_name with "_".
            training_model(
                API_level=api_lvl,
                feature_selection=argument["feature"],
                model_name=model_name.replace("_", " "))
        classifier = get_classifier(API_lvl=api_lvl, classfier_type=model_name)
        # The last vector element is dropped before prediction
        # (presumably the label slot -- confirm in make_apk_vector).
        result = make_prediction(classifier, vector[:-1])
        s(10)

        print("\n*******************************************************************************************************************************************************")
        print("\n************************************************************* Results for " + apk_path + " *************************************************************\n")
        print("*******************************************************************************************************************************************************")
        print("Default Conventions:\t0=NOT malware, 1=malware")
        print("Calculated Value : " + str(result["Confidential Interval"][0]))
        print("{:20} : {:.4f} ".format("Prediction Result", result["Prediction Result"][0]))
        if result["Prediction Result"][0] == 0:
            print("The Supplied APK Does not contain malware. Feel Free to install it")
        else:
            print("The Supplied APK contains malware. Do not install or Share")
    finally:
        # Release the connection even when a stage fails.
        db.close()