def create_random_sets():
    """Build a fresh random training set and testing set and encode both.

    Reads ``set_size``, ``testing_set_size``, ``malware_ratio`` and
    ``total_features`` from module scope; they are not parameters.

    NOTE(review): another ``create_random_sets`` is defined later in this
    file and would replace this one once the module is loaded -- confirm
    which definition is intended to survive.

    Returns:
        A 4-tuple ``(data, labels, test_data, test_labels)``: the two values
        returned by ``onehot.generate_input`` for the training set followed
        by the two values returned for the testing set.
    """
    # Training half: draw the applications, then encode them.
    print("Generating TRAINING set...")
    train_apps = onehot.generate_set(set_size, malware_ratio)
    print("Generating TRAINING input...")
    train_x, train_y = onehot.generate_input(train_apps, total_features)

    # Testing half: same two steps, sized by testing_set_size.
    print("Generating TESTING set...")
    test_apps = onehot.generate_set(testing_set_size, malware_ratio)
    print("Generating TESTING input...")
    test_x, test_y = onehot.generate_input(test_apps, total_features)

    return train_x, train_y, test_x, test_y
def create_sets():
    """Load (creating on first use) the persisted 8500-app sets and encode them.

    For each of the training and testing sets, if its text file is missing a
    new set is generated with ``onehot.generate_set`` and written one item per
    line. Both files are then read back and passed to
    ``onehot.generate_input`` together with the module-level
    ``total_features``.

    Fixes over the previous version:
      * Generated sets are written to the same ``*_8500.txt`` files that are
        checked for and read back. Previously they were written to
        ``*_1500.txt``, so a first run failed with ``FileNotFoundError``.
      * The progress message for the training set no longer says "TESTING".
      * Only the trailing newline is removed from each line, so a final line
        without a newline no longer loses its last character.

    Returns:
        A 4-tuple ``(data, labels, test_data, test_labels)``.
    """
    set_size = 8500
    malware_ratio = 0.3
    training_path = "training_set_8500.txt"
    testing_path = "testing_set_8500.txt"

    # Training is handled before testing, matching the original call order
    # into onehot.generate_set.
    pending = (
        (training_path, "Generating TRAINING set..."),
        (testing_path, "Generating TESTING set..."),
    )
    for path, message in pending:
        if os.path.isfile(path):
            continue
        print("Creating data-labels...")
        print(message)
        generated = onehot.generate_set(set_size, malware_ratio)
        with open(path, "w") as file:
            file.writelines(str(item) + "\n" for item in generated)

    # Read the sets back from disk so a run that just generated them and a
    # run that reuses existing files go through the same path.
    with open(training_path, "r") as file:
        training_set = [line.rstrip("\n") for line in file]
    with open(testing_path, "r") as file:
        testing_set = [line.rstrip("\n") for line in file]

    print("Generating TRAINING input...")
    data, labels = onehot.generate_input(training_set, total_features)
    print("Generating TESTING input...")
    test_data, test_labels = onehot.generate_input(testing_set, total_features)
    return data, labels, test_data, test_labels
def create_random_sets(set_size=1500, malware_ratio=0.3):
    """Generate one random set and return its encoded data and labels.

    Args:
        set_size: Passed to ``onehot.generate_set`` as the set size.
        malware_ratio: Passed to ``onehot.generate_set`` as the malware ratio.

    Returns:
        The pair ``(test_data, test_labels)`` returned by
        ``onehot.generate_input`` for the generated set, using the
        module-level ``total_features``.
    """
    print("Generating set...")
    random_apps = onehot.generate_set(set_size, malware_ratio)

    # Per the original comment, generate_input shuffles the set and performs
    # the one-hot encoding.
    print("Generating input...")
    encoded, targets = onehot.generate_input(random_apps, total_features)
    return encoded, targets
def create_set():
    """Load (creating on first use) the persisted 200-app testing set and encode it.

    If ``testing_set_200.txt`` is missing, a set of 200 items with a malware
    ratio of 0.5 is generated with ``onehot.generate_set`` and written one
    item per line. The file is then read back and passed to
    ``onehot.generate_input`` together with the module-level
    ``total_features``.

    Fix over the previous version: only the trailing newline is stripped from
    each line. The old ``line[:-1]`` removed the last real character of the
    final entry when the file did not end with a newline, and the preceding
    ``line.strip()`` discarded its result and did nothing.

    Returns:
        The pair ``(test_data, test_labels)`` returned by
        ``onehot.generate_input``.
    """
    testing_path = "testing_set_200.txt"

    if not os.path.isfile(testing_path):
        set_size = 200
        malware_ratio = 0.5
        print("Creating data-labels...")
        print("Generating TESTING set...")
        generated = onehot.generate_set(set_size, malware_ratio)
        with open(testing_path, "w") as file:
            file.writelines(str(item) + "\n" for item in generated)

    # Always read from disk so fresh and reused sets go through the same path.
    with open(testing_path, "r") as file:
        testing_set = [line.rstrip("\n") for line in file]

    print("Generating TESTING input...")
    test_data, test_labels = onehot.generate_input(testing_set, total_features)
    return test_data, test_labels
if __name__ == "__main__": total_features = 545333 # total unique features set_size = 2000 # set site that will be used to create random training set testing_set_size = 2000 # set site that will be used to create random test set malware_ratio = 0.3 # malware ratio in the set size onehot.create_list_of_apps() # function from set_one_encoding.py # check if a predefined training if os.path.isfile("training_set_2000.txt") is False and os.path.isfile( "testing_set_2000.txt") is False: print("Creating data-labels...") print("Generating TRAINING set...") training_set = onehot.generate_set( set_size, malware_ratio) # generate random training set with open("training_set_2000.txt", "w") as file: for item in training_set: file.write(str(item) + "\n") print("Generating TESTING set...") testing_set = onehot.generate_set( testing_set_size, malware_ratio) # generate random testing set with open("testing_set_2000.txt", "w") as file: for item in testing_set: file.write(str(item) + "\n") training_set = [] # the list of training set testing_set = [] # the list of testing set with open(