Example #1
def creation(args):
    """
        Function for the creationParser
        :param args: Namespace
        :return: nothing
        """
    # ---------------------
    #  Set different seeds
    # ---------------------
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.set_random_seed(seed_value)
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1,
                                  device_count={"CPU": 1})
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    K.set_session(sess)
    gan = GAN(lr=args.lr[0], sample=args.size[0])
    gan.dataType = args.type[0]

    # Load dataset
    if args.dataset[0] == "amazon":
        dataset = AMAZON_TRAIN
    elif args.dataset[0] == "phishtank":
        dataset = PHISHTANK_TRAIN
    elif args.dataset[0] == "total":
        dataset = TOTAL_TRAIN
    else:
        dataset = args.dataset[0]

    if args.clean == "total" or args.clean[0] == "total":
        clean = TOTAL_TEST
    elif args.clean == "amazon" or args.clean[0] == "amazon":
        clean = AMAZON_TEST
    else:
        clean = args.clean[0]

    if args.phish == "phishtank" or args.phish[0] == "phishtank":
        phish = PHISHTANK_TEST
    else:
        phish = args.phish[0]

    clean = list(importData.csv_to_list(clean)[1].values())
    phish = list(importData.csv_to_list(phish)[1].values())

    # Train then save
    gan.train(args.epochs[0], importData.csv_to_list(dataset)[1].values(), phishData=phish, cleanData=clean)
    gan.best_threshold_calculate(clean, phish, 0.0001, return_report=False)
    gan.save(args.name[0], args.location[0])
    return
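
A minimal invocation sketch, assuming an argparse-style Namespace whose attribute names mirror the args.<name>[0] accesses in creation(); every value below is a placeholder, not a project default.

# Hypothetical direct call to creation(); attribute names are taken from the
# function body above, the values are illustrative placeholders.
from argparse import Namespace

args = Namespace(
    lr=[0.0002],            # learning rate passed to GAN(lr=...)
    size=[32],              # sample size passed to GAN(sample=...)
    type=["phish"],         # stored on gan.dataType
    dataset=["phishtank"],  # "amazon", "phishtank", "total" or a CSV path
    clean=["total"],        # "total", "amazon" or a CSV path with clean test data
    phish=["phishtank"],    # "phishtank" or a CSV path with phishing test data
    epochs=[500],           # number of training epochs
    name=["gan_model"],     # name under which the model is saved
    location=["models/"],   # directory in which the model is saved
)
creation(args)
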
Example #2
def prediction(args):
    """
        Function for the predictParser
        :param args: Namespace
        :return: nothing
        """
    # Load GAN model
    gan = GAN(0.1, 1)
    gan.load(args.name[0], args.location[0])

    if args.file is not None:
        # Load data
        data = importData.csv_to_list(args.file[0])[1]
        for url in data.keys():
            # Make a prediction
            results = gan.discriminator.predict_on_batch(
                np.array(data[url]).astype(float).reshape(1, gan.countData, 1))

            # Write results in the right place
            if args.verbose is True:
                if args.output == "console" or args.output[0] == "console":
                    if results[0] < gan.thresHold:
                        print(str(url) + " : " + str(results[0][0]) + " -> phishing")
                    else:
                        print(str(url) + " : " + str(results[0][0]) + " -> safe")

                else:
                    with open(args.output[0], 'a', newline='') as outcsvfile:
                        writer = csv.writer(outcsvfile, delimiter=' ', quotechar='"')
                        if results[0] < gan.thresHold:
                            writer.writerow([str(url) + " : {} -> phishing".format(results[0][0])])
                        else:
                            writer.writerow([str(url) + " : {} -> safe".format(results[0][0])])

            else:
                if args.output == "console" or args.output[0] == "console":
                    if results[0] < gan.thresHold:
                        print(str(url) + " -> phishing")
                    else:
                        print(str(url) + " -> safe")

                else:
                    with open(args.output[0], 'a', newline='') as outcsvfile:
                        writer = csv.writer(outcsvfile, delimiter=' ', quotechar='"')
                        if results[0] < gan.thresHold:
                            writer.writerow([str(url) + " -> phishing"])
                        else:
                            writer.writerow([str(url) + " -> safe"])
    return
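
The same prediction path can be sketched for a single feature vector without the CSV loop; the model name, location and the zero vector below are placeholders, and the decision rule (a score below gan.thresHold means phishing) mirrors the branches above.

# Minimal single-vector sketch; "gan_model"/"models/" and the zero vector are
# placeholders, gan.countData is the expected feature-vector length.
import numpy as np

gan = GAN(0.1, 1)
gan.load("gan_model", "models/")

features = np.zeros(gan.countData)  # stands in for one URL's feature vector
score = gan.discriminator.predict_on_batch(
    features.astype(float).reshape(1, gan.countData, 1))[0][0]
label = "phishing" if score < gan.thresHold else "safe"
print("{:.4f} -> {}".format(score, label))
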
Example #3
    def train(self,
              epochs,
              data,
              plotFrequency=20,
              predict=False,
              phishData=None,
              cleanData=None):
        """
        Train the GAN
        :param epochs: int
        :param data: iterable of feature vectors (the training dataset)
        :param plotFrequency: int
        :param predict: bool (whether the training includes prediction on the test datasets)
        :param phishData: list of lists
        :param cleanData: list of lists
        :return: 7 lists (to plot training/validation accuracy/loss of generator/discriminator),
                 plus the best classification report (dict) and the epoch at which it was reached (int)
        """

        # Load the training dataset
        X_train = list(data)

        # Load testing datasets
        if phishData is None or cleanData is None:
            phisTest = list(importData.csv_to_list(PHIS_PATH_TEST)[1].values())
            cleanTest = list(
                importData.csv_to_list(CLEAN_PATH_TEST)[1].values())
        else:
            phisTest = list(phishData)
            cleanTest = list(cleanData)

        if len(cleanTest) > len(phisTest):
            cleanTest = cleanTest[:len(phisTest)]
        else:
            phisTest = phisTest[:len(cleanTest)]

        # Adversarial ground truths
        valid = np.ones((self.sampleSize, 1))
        fake = np.zeros((self.sampleSize, 1))

        # Initialize list for the return values
        accuracy = []
        Dloss = []
        Gloss = []
        vaccuracy = []
        vDloss = []
        vGloss = []
        X = []
        bestEpoch = -1
        bestClass = {"accuracy": 0}

        for epoch in range(1, epochs + 1):

            # Select a random batch of images
            # for training
            idxt = np.random.randint(1, int(len(X_train) * 0.9),
                                     self.sampleSize)
            imgst = np.vstack(np.array(X_train)[idxt])

            # for validation
            idxv = np.random.randint(int(len(X_train) * 0.9), len(X_train),
                                     self.sampleSize)
            imgsv = np.vstack(np.array(X_train)[idxv])

            # ---------------------
            #  Training
            # ---------------------

            noise = np.random.normal(0, 1, (self.sampleSize, self.countData))
            # Generate a batch of new data for training
            gen_data = self.generator.predict(noise)

            # ---------------------
            #  Train Discriminator
            # ---------------------
            d_loss_real = self.discriminator.train_on_batch(
                imgst.reshape(self.sampleSize, self.countData, 1), valid)
            d_loss_fake = self.discriminator.train_on_batch(gen_data, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # ---------------------
            #  Train Generator
            # ---------------------
            noise = np.random.normal(0, 1, (self.sampleSize, self.countData))

            # Train the generator (to have the discriminator label samples as valid)
            g_loss = self.combined.train_on_batch(noise, valid)

            # ---------------------
            #  Validation
            # ---------------------

            noise = np.random.normal(0, 1, (self.sampleSize, self.countData))
            # Generate a batch of new data for validation
            gen_data = self.generator.predict(noise)

            # ---------------------
            #  Validate Discriminator
            # ---------------------
            vd_loss_real = self.discriminator.test_on_batch(
                imgsv.reshape(self.sampleSize, self.countData, 1), valid)
            vd_loss_fake = self.discriminator.test_on_batch(gen_data, fake)
            vd_loss = 0.5 * np.add(vd_loss_real, vd_loss_fake)

            # ---------------------
            #  Validate Generator
            # ---------------------
            noise = np.random.normal(0, 1, (self.sampleSize, self.countData))
            vg_loss = self.combined.test_on_batch(noise, valid)

            # Plot the progress
            if epoch % plotFrequency == 0:
                logger.info(
                    "%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [D vloss: %f, vacc.: %.2f%%] [G vloss: %f]"
                    % (epoch, d_loss[0], 100 * d_loss[1], g_loss, vd_loss[0],
                       100 * vd_loss[1], vg_loss))
                accuracy.append(d_loss[1])
                X.append(epoch)
                Dloss.append(d_loss[0])
                Gloss.append(g_loss)
                vaccuracy.append(vd_loss[1])
                vDloss.append(vd_loss[0])
                vGloss.append(vg_loss)

            # Generate the classification report if requested
            if predict:
                report = self.class_report(cleanTest, phisTest)

                if "accuracy" in report:
                    if report["accuracy"] > bestClass["accuracy"]:
                        bestClass = report
                        bestEpoch = epoch
                del report

            del idxt, imgst, idxv, imgsv, noise, g_loss, gen_data, d_loss, d_loss_real, d_loss_fake, vd_loss_real, \
                vd_loss, vd_loss_fake, vg_loss
        del X_train

        if not predict:
            self.class_report(cleanTest, phisTest, calculate=False)

        return X, accuracy, Dloss, Gloss, vaccuracy, vDloss, vGloss, bestClass, bestEpoch
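
Since the docstring says the returned lists are meant for plotting, here is a hedged matplotlib sketch of how they could be drawn; gan, training_vectors, phish and clean are placeholders for objects prepared as in Example #1.

# Plotting sketch for the values returned by train(); matplotlib is assumed
# to be available and the call arguments are placeholders.
import matplotlib.pyplot as plt

X, acc, d_loss, g_loss, vacc, vd_loss, vg_loss, best_report, best_epoch = gan.train(
    500, training_vectors, predict=True, phishData=phish, cleanData=clean)

fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(X, d_loss, label="D loss (train)")
ax_loss.plot(X, vd_loss, label="D loss (val)")
ax_loss.plot(X, g_loss, label="G loss (train)")
ax_loss.plot(X, vg_loss, label="G loss (val)")
ax_loss.set_xlabel("epoch")
ax_loss.legend()
ax_acc.plot(X, acc, label="D accuracy (train)")
ax_acc.plot(X, vacc, label="D accuracy (val)")
ax_acc.set_xlabel("epoch")
ax_acc.legend()
plt.show()
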
Example #4
    def best_threshold_calculate(self,
                                 cleanTestPath,
                                 phishTestPath,
                                 step,
                                 return_report=True):
        """
        Used to determine the best threshold for prediction
        :param cleanTestPath: str
        :param phishTestPath: str
        :param step: float
        :param return_report: bool
        :return: dict (the best classification report) if return_report is True, otherwise nothing
        """

        phisTest = list(importData.csv_to_list(phishTestPath)[1].values())
        cleanTest = list(importData.csv_to_list(cleanTestPath)[1].values())

        if len(cleanTest) > len(phisTest):
            cleanTest = cleanTest[:len(phisTest)]
        else:
            phisTest = phisTest[:len(cleanTest)]

        # Construct the true results
        true = ["clean"] * len(cleanTest) + ["phish"] * len(phisTest)
        prediction = []

        # ---------------------
        #  Make prediction
        # ---------------------
        for i in cleanTest + phisTest:
            prediction.append(
                self.discriminator.predict_on_batch(
                    np.array(i).astype(float).reshape(
                        1, self.countData, 1)))

        averages = ((sum(prediction[:len(cleanTest)]) / len(cleanTest)),
                    (sum(prediction[len(cleanTest):]) / len(phisTest)))
        mini = min(averages)
        maxi = max(averages)

        bestClass = {"accuracy": 0}

        print("Total number of iterations: {}".format(
            len(np.arange(mini, maxi, step))))
        for threshold in np.arange(mini, maxi, step):
            predict = []
            for i in prediction:
                if self.dataType == "phish" and i[0][0] > threshold:
                    predict.append("phish")
                elif self.dataType != "phish" and i[0][0] < threshold:
                    predict.append("phish")
                else:
                    predict.append("clean")

            report = classification_report(np.array(true),
                                           np.array(predict),
                                           output_dict=True)
            if report["accuracy"] > bestClass["accuracy"]:
                bestClass = report
                self.thresHold = threshold

        if return_report:
            return bestClass
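
A usage sketch, assuming held-out CSVs at the (placeholder) paths below; the step matches the 0.0001 used in Example #1 and the selected threshold is stored on gan.thresHold.

# Hypothetical call; the CSV paths are placeholders.
report = gan.best_threshold_calculate("data/clean_test.csv",
                                      "data/phish_test.csv",
                                      0.0001)
print("best threshold:", gan.thresHold)
print("best accuracy :", report["accuracy"])
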
Example #5
def orm_extract(args):
    """
        Function for the ORMExtractParser
        :param args: Namespace
        :return: nothing
        """

    # Load database
    Base = databaseManage.WebsiteBase(args.database[0])
    Base.create_tables()

    if type(args.thread) is list:
        args.thread = args.thread[0]

    # Load data
    URLs = list(importData.csv_to_list(args.path[0])[1].keys())

    # ---------------------
    #  Filter the results already in database
    # ---------------------
    alreadyIn = []
    for url in Base.session.query(Base.__getattribute__(args.table[0])).all():
        alreadyIn.append(url.url)

    # Strip the "http://" / "https://" scheme from each URL
    URLs = [url[7:] if url.startswith("http://")
            else url[8:] if url.startswith("https://") else url
            for url in URLs]

    URLs = set(URLs)

    for url in alreadyIn:
        try:
            URLs.remove(url)
        except KeyError:
            pass
    logger.info("{} websites will be added to the database".format(len(URLs)))
    itera = iter(URLs)
    URLs = zip(*[itera] * args.thread)

    # ---------------------
    #  Add to the database
    # ---------------------
    dBase = databaseManage.NormalizationBase("DB/norm.db")
    normDict = {}
    for norm in dBase.session.query(dBase.Normalization).all():
        normDict[norm.feature] = {"data": norm.data, "normalizer": norm.normalizer, "scaler": norm.scaler}

    i = 1
    for url in URLs:
        logger.debug(str(i))
        logger.info("Add : {}".format(url))
        i += args.thread

        # Create URL object
        result1 = ThreadPool().map(Website.website, url)
        result2 = []
        tmp = []
        for web in result1:
            if web.html is None:
                result2.append(web)
                # result1.remove(web)
            else:
                tmp.append(web)
        if args.extraction:
            # Extract features
            fct = partial(Website.website.features_extraction, normDict=normDict)
            ThreadPool().map(fct, tmp)
            result2 += tmp
            for web in result2:
                print(web)
                # Add in database
                Base.adding(web, args.table[0])
        else:
            for web in result1:
                # Add in database
                Base.adding(web, args.table[0])

        if i % ((50 // args.thread) * args.thread) == 1 and i != 1:
            # Get new identity with tor
            with Controller.from_port(port=9051) as controller:
                controller.authenticate()
                controller.signal(Signal.NEWNYM)
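
The zip(*[itera] * args.thread) line above groups the URL set into fixed-size batches and silently drops any remainder; a small self-contained illustration of that idiom:

# The same iterator is consumed args.thread times per zip() step, so each
# emitted tuple holds that many consecutive items.
urls = ["a.com", "b.com", "c.com", "d.com", "e.com"]
n = 2  # stands in for args.thread
it = iter(urls)
for group in zip(*[it] * n):
    print(group)
# ('a.com', 'b.com')
# ('c.com', 'd.com')
# the trailing "e.com" is dropped because zip() stops at the shortest input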