def creation(args):
    """
    Function for the creationParser
    :param args: Namespace
    :return: nothing
    """
    # ---------------------
    #  Set different seeds
    # ---------------------
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.set_random_seed(seed_value)
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                  inter_op_parallelism_threads=1,
                                  device_count={"CPU": 1})
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    K.set_session(sess)

    gan = GAN(lr=args.lr[0], sample=args.size[0])
    gan.dataType = args.type[0]

    # Load dataset (named preset or an explicit path)
    if args.dataset[0] == "amazon":
        dataset = AMAZON_TRAIN
    elif args.dataset[0] == "phishtank":
        dataset = PHISHTANK_TRAIN
    elif args.dataset[0] == "total":
        dataset = TOTAL_TRAIN
    else:
        dataset = args.dataset[0]

    if args.clean == "total" or args.clean[0] == "total":
        clean = TOTAL_TEST
    elif args.clean == "amazon" or args.clean[0] == "amazon":
        clean = AMAZON_TEST
    else:
        clean = args.clean[0]

    if args.phish == "phishtank" or args.phish[0] == "phishtank":
        phish = PHISHTANK_TEST
    else:
        phish = args.phish[0]

    clean = list(importData.csv_to_list(clean)[1].values())
    phish = list(importData.csv_to_list(phish)[1].values())

    # Train then save
    gan.train(args.epochs[0], importData.csv_to_list(dataset)[1].values(),
              phishData=phish, cleanData=clean)
    gan.best_threshold_calculate(clean, phish, 0.0001, return_report=False)
    gan.save(args.name[0], args.location[0])
    return
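
# A minimal sketch of how `creation` might be wired to its sub-parser. The
# parser, option names, and defaults below are assumptions (not taken from this
# file); only the one-element-list shape (nargs=1) is implied by the
# `args.xxx[0]` accesses above.
def build_creation_parser(subparsers):
    # Hypothetical sub-command registration for the creationParser
    creationParser = subparsers.add_parser("creation")
    creationParser.add_argument("--lr", nargs=1, type=float, default=[0.1])
    creationParser.add_argument("--size", nargs=1, type=int, default=[100])
    creationParser.add_argument("--type", nargs=1, default=["phish"])
    creationParser.add_argument("--dataset", nargs=1, default=["total"])
    creationParser.add_argument("--clean", nargs=1, default=["total"])
    creationParser.add_argument("--phish", nargs=1, default=["phishtank"])
    creationParser.add_argument("--epochs", nargs=1, type=int, default=[500])
    creationParser.add_argument("--name", nargs=1, default=["gan"])
    creationParser.add_argument("--location", nargs=1, default=["."])
    creationParser.set_defaults(func=creation)
    return creationParser
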
def prediction(args):
    """
    Function for the predictParser
    :param args: Namespace
    :return: nothing
    """
    # Load GAN model
    gan = GAN(0.1, 1)
    gan.load(args.name[0], args.location[0])

    if args.file is not None:
        # Load data
        data = importData.csv_to_list(args.file[0])[1]
        for url in data.keys():
            # Make a prediction
            results = gan.discriminator.predict_on_batch(
                np.array(data[url]).astype(np.float)[:].reshape(1, gan.countData, 1))
            # Write results in the right place
            if args.verbose is True:
                if args.output == "console" or args.output[0] == "console":
                    if results[0] < gan.thresHold:
                        print(str(url) + " : " + str(results[0][0]) + " -> phishing")
                    else:
                        print(str(url) + " : " + str(results[0][0]) + " -> safe")
                else:
                    with open(args.output[0], 'a', newline='') as outcsvfile:
                        writer = csv.writer(outcsvfile, delimiter=' ', quotechar='"')
                        if results[0] < gan.thresHold:
                            writer.writerow([str(url) + " : {} -> phishing".format(results[0][0])])
                        else:
                            writer.writerow([str(url) + " : {} -> safe".format(results[0][0])])
            else:
                if args.output == "console" or args.output[0] == "console":
                    if results[0] < gan.thresHold:
                        print(str(url) + " -> phishing")
                    else:
                        print(str(url) + " -> safe")
                else:
                    with open(args.output[0], 'a', newline='') as outcsvfile:
                        writer = csv.writer(outcsvfile, delimiter=' ', quotechar='"')
                        if results[0] < gan.thresHold:
                            writer.writerow([str(url) + " -> phishing"])
                        else:
                            writer.writerow([str(url) + " -> safe"])
    return
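
# A hypothetical end-to-end use of `prediction`, assuming a sub-parser shaped
# like the sketch above; model, file, and option names are illustrative only.
#
#   args = parser.parse_args(["predict", "--name", "gan", "--location", "models/",
#                             "--file", "data/urls.csv", "--output", "console",
#                             "--verbose"])
#   args.func(args)  # prints "<url> : <score> -> phishing|safe" per URL
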
def train(self, epochs, data, plotFrequency=20, predict=False, phishData=None, cleanData=None):
    """
    Train the GAN
    :param epochs: int
    :param data: string (path to the dataset used to train the GAN)
    :param plotFrequency: int
    :param predict: bool (if the training includes prediction on test datasets)
    :param phishData: list of lists
    :param cleanData: list of lists
    :return: 7 lists (to plot training/validation accuracy/loss of
             generator/discriminator), plus the best classification report
             and the epoch at which it was reached
    """
    # Load the training dataset
    X_train = list(data)

    # Load testing datasets
    if phishData is None or cleanData is None:
        phisTest = list(importData.csv_to_list(PHIS_PATH_TEST)[1].values())
        cleanTest = list(importData.csv_to_list(CLEAN_PATH_TEST)[1].values())
    else:
        phisTest = list(phishData)
        cleanTest = list(cleanData)

    # Balance the two test sets to the same size
    if len(cleanTest) > len(phisTest):
        cleanTest = cleanTest[:len(phisTest)]
    else:
        phisTest = phisTest[:len(cleanTest)]

    # Adversarial ground truths
    valid = np.ones((self.sampleSize, 1))
    fake = np.zeros((self.sampleSize, 1))

    # Initialize lists for the return values
    accuracy = []
    Dloss = []
    Gloss = []
    vaccuracy = []
    vDloss = []
    vGloss = []
    X = []

    bestEpoch = -1
    bestClass = {"accuracy": 0}

    for epoch in range(1, epochs + 1):
        # Select a random batch of samples
        # for training (first 90% of the dataset)
        idxt = np.random.randint(1, int(len(X_train) * 0.9), self.sampleSize)
        imgst = np.vstack(np.array(X_train)[idxt])
        # for validation (last 10%)
        idxv = np.random.randint(int(len(X_train) * 0.9), len(X_train), self.sampleSize)
        imgsv = np.vstack(np.array(X_train)[idxv])

        # ---------------------
        #  Training
        # ---------------------
        noise = np.random.normal(0, 1, (self.sampleSize, self.countData))

        # Generate a batch of new data for training
        gen_data = self.generator.predict(noise)

        # ---------------------
        #  Train Discriminator
        # ---------------------
        d_loss_real = self.discriminator.train_on_batch(
            imgst.reshape(self.sampleSize, self.countData, 1), valid)
        d_loss_fake = self.discriminator.train_on_batch(gen_data, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # ---------------------
        #  Train Generator
        # ---------------------
        noise = np.random.normal(0, 1, (self.sampleSize, self.countData))

        # Train the generator (to have the discriminator label samples as valid)
        g_loss = self.combined.train_on_batch(noise, valid)

        # ---------------------
        #  Validation
        # ---------------------
        noise = np.random.normal(0, 1, (self.sampleSize, self.countData))

        # Generate a batch of new data for validation
        gen_data = self.generator.predict(noise)

        # ---------------------
        #  Validate Discriminator
        # ---------------------
        vd_loss_real = self.discriminator.test_on_batch(
            imgsv.reshape(self.sampleSize, self.countData, 1), valid)
        vd_loss_fake = self.discriminator.test_on_batch(gen_data, fake)
        vd_loss = 0.5 * np.add(vd_loss_real, vd_loss_fake)

        # ---------------------
        #  Validate Generator
        # ---------------------
        noise = np.random.normal(0, 1, (self.sampleSize, self.countData))
        vg_loss = self.combined.test_on_batch(noise, valid)

        # Plot the progress
        if epoch % plotFrequency == 0:
            logger.info(
                "%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [D vloss: %f, vacc.: %.2f%%] [G vloss: %f]"
                % (epoch, d_loss[0], 100 * d_loss[1], g_loss, vd_loss[0], 100 * vd_loss[1], vg_loss))
            accuracy.append(d_loss[1])
            X.append(epoch)
            Dloss.append(d_loss[0])
            Gloss.append(g_loss)
            vaccuracy.append(vd_loss[1])
            vDloss.append(vd_loss[0])
            vGloss.append(vg_loss)

        # Generate the classification report if necessary
        if predict:
            report = self.class_report(cleanTest, phisTest)
            if "accuracy" in report:
                if report["accuracy"] > bestClass["accuracy"]:
                    bestClass = report
                    bestEpoch = epoch
            del report
        del idxt, imgst, idxv, imgsv, noise, g_loss, gen_data, d_loss, d_loss_real, d_loss_fake, \
            vd_loss_real, vd_loss, vd_loss_fake, vg_loss

    del X_train
    if not predict:
        self.class_report(cleanTest, phisTest, calculate=False)

    return X, accuracy, Dloss, Gloss, vaccuracy, vDloss, vGloss, bestClass, bestEpoch
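
# A minimal usage sketch for `train`, assuming feature vectors have already
# been loaded through importData.csv_to_list; paths and hyper-parameters below
# are illustrative, not taken from this file.
#
#   gan = GAN(lr=0.0002, sample=64)
#   data = importData.csv_to_list("data/train.csv")[1].values()
#   X, acc, dl, gl, vacc, vdl, vgl, best, bestEpoch = gan.train(
#       2000, data, plotFrequency=20, predict=True,
#       phishData=phish, cleanData=clean)
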
def best_threshold_calculate(self, cleanTestPath, phishTestPath, step, return_report=True):
    """
    Used to determine the best threshold for prediction
    :param cleanTestPath: str (CSV path) or list of feature vectors
    :param phishTestPath: str (CSV path) or list of feature vectors
    :param step: float
    :param return_report: bool
    :return: dict (best classification report) if return_report is True
    """
    # Accept either a CSV path or an already-loaded list of feature vectors
    # (creation() passes lists)
    if isinstance(phishTestPath, str):
        phisTest = list(importData.csv_to_list(phishTestPath)[1].values())
    else:
        phisTest = list(phishTestPath)
    if isinstance(cleanTestPath, str):
        cleanTest = list(importData.csv_to_list(cleanTestPath)[1].values())
    else:
        cleanTest = list(cleanTestPath)

    # Balance the two test sets to the same size
    if len(cleanTest) > len(phisTest):
        cleanTest = cleanTest[:len(phisTest)]
    else:
        phisTest = phisTest[:len(cleanTest)]

    # Construct the true labels
    true = ["clean"] * len(cleanTest) + ["phish"] * len(phisTest)
    prediction = []

    # ---------------------
    #  Make predictions
    # ---------------------
    for i in cleanTest + phisTest:
        prediction.append(
            self.discriminator.predict_on_batch(
                np.array(i).astype(np.float)[:].reshape(1, self.countData, 1)))

    # Scan thresholds between the mean scores of the two classes
    averages = ((sum(prediction[:len(cleanTest)]) / len(cleanTest)),
                (sum(prediction[len(cleanTest):]) / len(phisTest)))
    mini = min(averages)
    maxi = max(averages)
    bestClass = {"accuracy": 0}
    print("Total number of iterations: {}".format(len(np.arange(mini, maxi, step))))
    for threshold in np.arange(mini, maxi, step):
        predict = []
        for i in prediction:
            if self.dataType == "phish" and i[0][0] > threshold:
                predict.append("phish")
            elif self.dataType != "phish" and i[0][0] < threshold:
                predict.append("phish")
            else:
                predict.append("clean")
        report = classification_report(np.array(true), np.array(predict), output_dict=True)
        # Keep the threshold that yields the best accuracy
        if report["accuracy"] > bestClass["accuracy"]:
            bestClass = report
            self.thresHold = threshold
    if return_report:
        return bestClass
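
# A minimal usage sketch; file names are illustrative, and 0.0001 is the scan
# step between the mean discriminator scores of the two classes.
#
#   report = gan.best_threshold_calculate("data/clean_test.csv",
#                                         "data/phish_test.csv", 0.0001)
#   print(gan.thresHold)  # threshold that maximized accuracy during the scan
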
def orm_extract(args):
    """
    Function for the ORMExtractParser
    :param args: Namespace
    :return: nothing
    """
    # Load database
    Base = databaseManage.WebsiteBase(args.database[0])
    Base.create_tables()

    if type(args.thread) is list:
        args.thread = args.thread[0]

    # Load data
    URLs = list(importData.csv_to_list(args.path[0])[1].keys())

    # ---------------------
    #  Filter the results already in the database
    # ---------------------
    alreadyIn = []
    for url in Base.session.query(Base.__getattribute__(args.table[0])).all():
        alreadyIn.append(url.url)

    # Strip the scheme so URLs match the stored format
    for url in URLs:
        if "http://" in url[:7]:
            URLs[URLs.index(url)] = url[7:]
        elif "https://" in url[:8]:
            URLs[URLs.index(url)] = url[8:]

    URLs = set(URLs)
    for url in alreadyIn:
        try:
            URLs.remove(url)
        except KeyError:
            pass

    logger.info("{} websites will be added to the database".format(len(URLs)))

    # Group the remaining URLs into batches of `args.thread`
    itera = iter(URLs)
    URLs = zip(*[itera] * args.thread)

    # ---------------------
    #  Add to the database
    # ---------------------
    dBase = databaseManage.NormalizationBase("DB/norm.db")
    normDict = {}
    for norm in dBase.session.query(dBase.Normalization).all():
        normDict[norm.feature] = {"data": norm.data,
                                  "normalizer": norm.normalizer,
                                  "scaler": norm.scaler}

    i = 1
    for url in URLs:
        logger.debug(str(i))
        logger.info("Add : {}".format(url))
        i += args.thread
        # Create URL objects in parallel
        result1 = ThreadPool().map(Website.website, url)
        result2 = []
        tmp = []
        for web in result1:
            if web.html is None:
                result2.append(web)
                # result1.remove(web)
            else:
                tmp.append(web)
        if args.extraction:
            # Extract features
            fct = partial(Website.website.features_extraction, normDict=normDict)
            ThreadPool().map(fct, tmp)
            result2 += tmp
            for web in result2:
                print(web)
                # Add to the database
                Base.adding(web, args.table[0])
        else:
            for web in result1:
                # Add to the database
                Base.adding(web, args.table[0])

        # Request a new identity from Tor roughly every 50 URLs
        if i % ((50 // args.thread) * args.thread) == 1 and i != 1:
            with Controller.from_port(port=9051) as controller:
                controller.authenticate()
                controller.signal(Signal.NEWNYM)
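
# The NEWNYM signal above assumes a local Tor daemon with its control port
# enabled. A minimal sketch of the usual torrc lines (illustrative):
#
#   ControlPort 9051
#   CookieAuthentication 1
#
# With cookie authentication enabled, controller.authenticate() needs no
# password argument.
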