Example #1
def write_signature_features_CSV(feature_list, file_name=""):
    """
    write features for signature filter to csv
    """

    if not file_name:
        file_name = SIGNATURE_FEATURE_DATABASE

    id = 0

    with open(DATA_PATH + file_name, 'w') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(SIGNATURE_FEATURE_LIST_COLUMN_NAMES)

        for entry in feature_list:
            if entry is not None:
                writer.writerow([
                    id, entry.url, entry.final_url, entry.label,
                    entry.cert_subject, entry.ent1, entry.ent2, entry.ent3,
                    entry.ent4, entry.ent5, entry.term1, entry.term2,
                    entry.term3, entry.term4, entry.term5
                ])

                id += 1

    log(action_logging_enum=INFO,
        logging_text="Feature list written to CSV file. [{f}]".format(
            f=file_name))
Example #2
def do_search(domain, suffix, url, words):
    search_string = ""

    search_string = "{d}.{s}".format(d=domain, s=suffix)

    for word in words:
        if word is not None:
            search_string += " {w}".format(w=word)

    try:
        result = bingsearch.search(search_string, wait_after_429=False)
    except Exception as e:
        result = -1
        log(action_logging_enum=WARNING, logging_text=str(e))

    if result == -1:
        return None

    for entry in result:
        extracted_search = get_url_components(str(entry))
        search_domain = extracted_search[3]
        search_cctld = extracted_search[4]
        found_url = str(entry)

        if str(search_domain) == domain and str(suffix) == str(search_cctld):

            log(action_logging_enum=INFO,
                logging_text="Login page found by search engine.")
            return found_url, True

    return url, False
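# Usage sketch (added, not part of the original snippet): do_search() returns None
# when the search backend fails, otherwise a (url, found) tuple, so callers should
# handle both shapes. The domain, suffix, and keyword values below are placeholders.
outcome = do_search("example", "com", "https://example.com/login", ["login", "signin"])
if outcome is None:
    print("Search failed.")
else:
    found_url, found = outcome
    print("Found via search engine:" if found else "Keeping original URL:", found_url)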
Example #3
def save_model(knn):

    with open(SAVED_MODEL_FILE, 'wb') as file:
        pickle.dump(knn, file)

    log(action_logging_enum=INFO,
        logging_text="[K-NEAREST NEIGHBOR]: Model saved.")
Example #4
def load_model():
    # load model from file
    logistic_regression = pickle.load(open(SAVED_MODEL_FILE, 'rb'))

    log(action_logging_enum=INFO, logging_text="[LOGISTIC REGRESSION]: Model loaded.")

    return logistic_regression
Example #5
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)
    pd.set_option('display.max_columns', None)

    if len(data) == 0:
        data = pd.DataFrame(pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE))
        data = transform_data(data)

    data = data.replace("True", 1)
    data = data.replace("False", 0)
    y = data['Label'].copy()
    x = data.drop(["Label"], axis=1).copy()
    train, test = train_test_split(data, test_size=0.35)
    log(action_logging_enum=INFO,
        logging_text="[K-NEAREST NEIGHBOR] Data ready for use.")

    y_train = train['Label'].copy()
    x_train = train.drop(["Label"], axis=1).copy()

    params = {'n_neighbors': 9}

    knn = KNeighborsClassifier()  # optionally: KNeighborsClassifier(**params)
    f1 = print_scores(knn, x, y)
    log(action_logging_enum=INFO,
        logging_text="[K-NEAREST NEIGHBOR] Starting training.")

    knn.fit(x_train, y_train)
    save_model(knn=knn)
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
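    # Evaluation sketch (added, not part of the original snippet): the split above
    # creates a held-out `test` frame that is never scored; inside train_model, after
    # knn.fit, it could be evaluated like this (the sklearn.metrics import is assumed):
    #     from sklearn.metrics import f1_score
    #     y_test = test['Label'].copy()
    #     x_test = test.drop(["Label"], axis=1).copy()
    #     log(INFO, "Held-out F1: {}".format(f1_score(y_test, knn.predict(x_test))))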
Example #6
    def neighborhood_entropy(r, s=None, union=False):
        # get neighborhood entropy for subsets R and S or UNION of R and S

        feature_set = r
        sum_list = []

        if union:
            feature_set = pd.concat([feature_set, s], axis=1)

        n = len(feature_set)

        def do_get(i_):
            g_e = get_neighborhood(feature_set, i_, size_row, distance,
                                   feature_vec)
            return math.log((g_e / n), 2)

        try:
            feature_vec = feature_set.values.tolist()
            size_row = len(feature_set)
            distance = 0.15
            for i in range(n):
                sum_list.append(do_get(i))

        except Exception as e:
            log(ERROR, str(e))

        total = sum(sum_list)

        return total * -(1 / n)
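    # Note (added, not part of the original snippet): assuming get_neighborhood()
    # returns the number of samples within distance 0.15 of row i for the given
    # feature subset, the value returned above is the neighborhood entropy
    #     NE(R) = -(1/n) * sum_{i=1..n} log2(|neighborhood_R(x_i)| / n),
    # i.e. each row contributes the log2 of its relative neighborhood size.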
Example #7
def predict_url(url):
    try:
        if isinstance(url, str):
            features = mod_feature_extraction.extract_features_from_URL(url, "PREDICT", True)
            x_pred = pd.DataFrame(features)
        else:
            x_pred = url

        x_pred = transform_data(x_pred)


        random_forest = load_model()
        y_pred = random_forest.predict(x_pred)

        return y_pred.tolist()

    except Exception as e:
        exc_type, exc_obj, tb = sys.exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj))
        log(action_logging_enum=WARNING, logging_text=str(e))
        log(action_logging_enum=WARNING, logging_text=str(e.__traceback__))
        return None
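# Usage sketch (added, not part of the original snippet): predict_url() accepts a raw
# URL string or an already prepared feature frame and returns a list of predicted
# labels, or None on error. The URL below is a placeholder.
labels = predict_url("https://example.com/signin")
if labels is not None:
    print("Predicted label:", labels[0])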
Example #8
    def check_status():

        nonlocal complete
        nonlocal failed
        nonlocal size_before

        while True:
            entry = q.get()
            url = entry.url
            is_up = False
            try:
                response = requests.get(url, timeout=10, headers=headers)
                response.raise_for_status()

                if response.status_code >= 200 and response.status_code < 400:
                    is_up = True
            except Exception as e:
                log(action_logging_enum=ERROR, logging_text=str(e))
                is_up = False

            if is_up:
                complete += 1
                log(action_logging_enum=INFO,
                    logging_text="Found {} of {}. (Failed: {})".format(
                        complete, size_before, failed))
                modify_list(entry)
            else:
                failed += 1

            q.task_done()
Example #9
def extract_features_from_signature_list(data):
    """
        extract all signature features from list with urls and labels
    """
    @ray.remote
    def get_feature_entry(entry):

        new_entry = extract_features_from_signature(entry.url, entry.label)

        if new_entry is not None:
            return new_entry

    try:
        result_ids = []
        for entry in data:
            result_ids.append(get_feature_entry.remote(entry))

        feature_list = ray.get(result_ids)

    except KeyboardInterrupt as e:
        log(action_logging_enum=ERROR,
            logging_text=
            "Process interrupted by keyboard signal. Returning the list.")
        feature_list = ray.get(result_ids)

        return feature_list

    return feature_list
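# Driver sketch (added, not part of the original snippet): Ray has to be initialized
# once in the calling process before .remote() tasks are submitted. `url_entries` is
# assumed to be a list of objects with .url and .label attributes, as iterated above.
import ray

if not ray.is_initialized():
    ray.init(ignore_reinit_error=True)

features = extract_features_from_signature_list(url_entries)
write_signature_features_CSV(features)  # CSV writer from Example #1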
Example #10
def save_model(adaptive_boosting):

    with open(SAVED_MODEL_FILE, 'wb') as file:
        pickle.dump(adaptive_boosting, file)

    log(action_logging_enum=INFO,
        logging_text="[ADAPTIVE BOOSTING]: Model saved.")
Example #11
def extract_features_from_website_list_ray(data):
    """
        extract all features from website list, ray used for parallelism
    """
    @ray.remote
    def get_feature_entry(entry):

        new_entry = extract_features_from_website(entry.url, entry.label,
                                                  False)

        if new_entry is not None:
            return new_entry

    try:
        result_ids = []
        for entry in data:
            result_ids.append(get_feature_entry.remote(entry))

        feature_list = ray.get(result_ids)

    except KeyboardInterrupt as e:
        log(action_logging_enum=ERROR,
            logging_text=
            "Process interrupted by keyboard signal. Returning the list.")
        feature_list = ray.get(result_ids)

        return feature_list

    return feature_list
Example #12
def create_brand_list_by_alexa():
    alexa_list = open_dataset_XML_file(filename=ALEXA_FILE,
                                       iterateable="entry",
                                       label="Benign",
                                       url_label="url",
                                       label_label="label")

    brand_list = []

    for entry in alexa_list:
        url = entry.url

        brand_list.append(tldextract.extract(url).domain)

    root = et.Element("data")
    tree = et.ElementTree(root)

    for brand in brand_list:
        entry = et.SubElement(root, "entry")
        entrylabel = et.SubElement(entry, "brandname")
        entrylabel.text = str(brand)

    with open(DATA_PATH + BRAND_FILE, 'wb') as file:
        tree.write(file, pretty_print=True)
    log(action_logging_enum=INFO,
        logging_text="Write process to XML finished for [{f}].".format(
            f=BRAND_FILE))
Example #13
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)

    if len(data) == 0:
        data = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)
        data = transform_data(data)

    train, test = train_test_split(data, test_size=0.2)
    pd.set_option('display.max_columns', None)
    y = data['Label']
    x = data.drop(['Label'], axis=1).values
    log(action_logging_enum=INFO,
        logging_text="[ADAPTIVE BOOSTING] Data ready for use.")

    if do_optimize:
        optimize()

    params = {'n_estimators': 120, 'random_state': 0}

    log(action_logging_enum=INFO,
        logging_text="[ADAPTIVE BOOSTING] Starting training.")
    adaptive_boosting = AdaBoostClassifier()  # optionally: AdaBoostClassifier(**params)
    f1 = print_scores(adaptive_boosting, x, y)

    y_train = train['Label']
    x_train = train.drop(['Label'], axis=1)

    # random_state=1, n_estimators=120, min_samples_leaf=5, min_samples_split=10,
    #                                    max_features='sqrt', max_depth=17

    adaptive_boosting.fit(x_train, y_train)
    save_model(adaptive_boosting=adaptive_boosting)
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
Example #14
def optimize():
    log(action_logging_enum=INFO,
        logging_text="[SUPPORT VECTOR MACHINE]: Starting to search for best c, gamma, kernel by cross validating different values.")

    # read data
    train = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)
    train = transform_data(train)

    x_train = train.drop(['Label'], axis=1)
    y_train = train['Label'].copy()

    param_grid = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]

    # Create grid search using 10-fold cross validation
    model = SVC()
    clf = GridSearchCV(model, param_grid, cv=10, verbose=0)
    best_model = clf.fit(x_train, y_train)

    # View best hyperparameters
    log(action_logging_enum=INFO, logging_text="[SUPPORT VECTOR MACHINE]: Hyperparameter tuning completed.")
    log(INFO, 'Best Kernel: {}'.format(best_model.best_estimator_.get_params()['kernel']))
    log(INFO, 'Best C: {}'.format(best_model.best_estimator_.get_params()['C']))
    log(INFO, 'Best gamma: {}'.format(best_model.best_estimator_.get_params()['gamma']))
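    # Follow-up sketch (added, not part of the original snippet): GridSearchCV exposes
    # the tuned settings via best_params_; inside optimize() they could be reused
    # directly to fit a final model, e.g.:
    #     best_params = clf.best_params_   # e.g. {'C': 10, 'kernel': 'rbf', 'gamma': 0.001}
    #     tuned_svm = SVC(**best_params).fit(x_train, y_train)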
Example #15
def save_model(extreme_gradient):
    with open(SAVED_MODEL_FILE, 'wb') as file:
        pickle.dump(extreme_gradient, file)

    log(action_logging_enum=INFO,
        logging_text="[EXTREME GRADIENT BOOSTING]: Model saved.")
    return
Example #16
def load_model():
    # load model from file
    support_vector_machine = pickle.load(open(SAVED_MODEL_FILE, 'rb'))

    log(action_logging_enum=INFO, logging_text="[SUPPORT VECTOR MACHINE]: Model loaded.")

    return support_vector_machine
Example #17
def load_model():
    # load model from file
    decision_tree = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
    log(action_logging_enum=INFO,
        logging_text="[DECISION TREE]: Model loaded.")

    return decision_tree
Example #18
    def get_feature_contribution_subset(feature_, y_, coalition_, coal_size):
        # get contribution to coalition for feature_ (delta for feature i and coalition K_i)

        mutual_information = neighbor_mutual_information_coalition(
            coalition_, y_, feature_)

        if mutual_information < 0:
            log(INFO, "Return contribution.")
            return 0

        log(INFO, "Contribution bigger than 0.")

        # check for all features S in coalition and feature_ R -> NMI(S ; D | R) > NMI(S ; D)

        related = 0
        is_related = False
        for i in range(coal_size):
            nmi_cj_y_ci = neighbor_mutual_information_coalition(
                coalition_.iloc[:, i], y_, feature_)
            nmi_cj_y = neighbor_mutual_information_coalition(
                coalition_.iloc[:, i], y_, less=True)

            if nmi_cj_y_ci > nmi_cj_y:
                related += 1

        if related >= (coal_size / 2):
            log(INFO, "Contribution is related.")
            is_related = True

        if not is_related:
            log(INFO, "Return contribution.")
            return 0

        log(INFO, "Return contribution.")
        return 1
Example #19
def get_input_to_lower():
    prompt = input("type a command:")
    prompt = prompt.lower()

    # log system commands
    if prompt != "":
        log(action_logging_enum=INFO, logging_text="--> " + prompt)

    return prompt
Example #20
    def do_get(i_, y, x, combinations, feature, c, lene):

        res = get_feature_contribution_subset(y_=y,
                                              coalition_=x.iloc[:,
                                                                combinations],
                                              feature_=feature,
                                              coal_size=len(combinations))
        log(INFO, "Processed coalition {} of {}.".format(c, lene))
        return res
Example #21
def load_model():
    try:
        # load model from file
        random_forest = pickle.load(open(SAVED_MODEL_FILE, 'rb'))

        log(action_logging_enum=INFO, logging_text="[RANDOM FOREST]: Model loaded.")

        return random_forest
    except Exception:
        return None
Example #22
def load_model():

    # load model from file
    try:
        adaptive_boosting = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
    except Exception:
        return None
    log(action_logging_enum=INFO, logging_text="[ADAPTIVE BOOSTING]: Model loaded.")

    return adaptive_boosting
Example #23
def load_model():
    # load model from file
    try:
        extreme_gradient = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
    except Exception:
        return None
    log(action_logging_enum=INFO,
        logging_text="[EXTREME GRADIENT BOOSTING]: Model loaded.")

    return extreme_gradient
Example #24
def open_dataset_XML_file(filename,
                          iterateable,
                          label=None,
                          url_label="url",
                          label_label=None,
                          max_line_count=-1):
    """
    open xml file and write to list
    filename: xml filename
    iterateable: node containing data
    label: label for all data
    url_label: url specifier in xml data
    label_label: label specifier in xml data
    max_line_count: maximum entries to be written to list
    """

    PATH = ""
    if not os.path.isfile(DATA_PATH + filename):
        log(action_logging_enum=WARNING,
            logging_text="File [{f}] does not exist.".format(f=filename))
        log(action_logging_enum=INFO, logging_text="Trying in backup folder.")

        if not os.path.isfile(DATA_BACKUP_PATH + filename):
            log(action_logging_enum=ERROR,
                logging_text="File [{f}] does even not exist in backup folder."
                .format(f=filename))
            return None
        else:
            PATH = DATA_BACKUP_PATH
            log(action_logging_enum=INFO,
                logging_text="Found in backup folder.")
    else:
        PATH = DATA_PATH

    datalist = []
    parser = et.XMLParser(strip_cdata=False)
    xtree = et.parse(PATH + filename, parser=parser)
    root = xtree.getroot()
    index = 1

    for entry in root.iter(iterateable):
        url = entry.find(url_label).text
        e = Entry(entry.find(label_label).text,
                  url) if label_label is not None else Entry(label, url)
        datalist.append(e)

        if index == max_line_count:
            break

        index += 1

    log(action_logging_enum=INFO,
        logging_text="XML File filled in list. FILE: [{f}].".format(
            f=filename))
    return datalist
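# Usage sketch (added, not part of the original snippet): Example #12 calls this helper
# for the Alexa list; a capped variant could look like this (the 1000-entry cap is a
# made-up value).
alexa_entries = open_dataset_XML_file(filename=ALEXA_FILE,
                                      iterateable="entry",
                                      label="Benign",
                                      url_label="url",
                                      max_line_count=1000)
for e in alexa_entries or []:
    print(e.label, e.url)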
Example #25
def load_model():
    # load model from file
    try:
        knn = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
    except Exception:
        return None

    log(action_logging_enum=INFO,
        logging_text="[K-NEAREST NEIGHBOR]: Model loaded.")

    return knn
Example #26
def delete_data(filename):
    """
    delete file for param filename in data_path
    """
    if os.path.isfile(DATA_PATH + filename):
        os.remove(DATA_PATH + filename)
        log(action_logging_enum=INFO,
            logging_text="File [{f}] deleted.".format(f=DATA_PATH + filename))
    else:
        log(action_logging_enum=WARNING,
            logging_text="File [{f}] dose not exists.".format(f=filename))
Example #27
def save_last_score(prec_train, prec_test, recall_train, recall_test, f1_train, f1_test):
    with open(SCORE_FILE, 'w') as file:
        file.write("Precision Train: " + str(prec_train) + "\n")
        file.write("Precision Test: " + str(prec_test) + "\n")
        file.write("Recall Train: " + str(recall_train) + "\n")
        file.write("Recall Test: " + str(recall_test) + "\n")
        file.write("F1-Score Train: " + str(f1_train) + "\n")
        file.write("F1-Score Test: " + str(f1_test) + "\n")

    log(action_logging_enum=INFO, logging_text="[ADAPTIVE BOOSTING]: score saved in [{f}]".format(f=SCORE_FILE))
    return
Example #28
def download_file(url, filename):
    """
    download database file from url and save by filename
    url: url where data has to be downloaded
    filename: filename for data to be saved
    """

    response = requests.get(url, timeout=10, headers=headers)
    with open(DATA_PATH + filename, 'wb') as file:
        file.write(response.content)
    log(action_logging_enum=INFO,
        logging_text="Download completed for [{f}].".format(f=filename))
Example #29
    def merge_dataframes(dataframe):
        try:
            nonlocal df

            if not pd.DataFrame(dataframe).empty:

                if df.empty:
                    df = dataframe
                else:
                    df = pd.concat([df, dataframe], ignore_index=True)
        except Exception as e:
            log(action_logging_enum=WARNING, logging_text=str(e))
Example #30
    def fetch_results(search_term, number_results, language_code):
        escaped_search_term = search_term.replace(' ', '+')

        google_url = 'https://www.google.de/search?q={}&num={}&hl={}'.format(
            escaped_search_term, number_results + 1, language_code)

        try:
            response = requests.get(google_url, headers=usr_agent, timeout=10)
            response.raise_for_status()
        except Exception as e:
            try:
                time.sleep(2)
                response = requests.get(google_url,
                                        headers=usr_agent,
                                        timeout=10)
            except Exception as e:
                log(action_logging_enum=ERROR,
                    logging_text=
                    "[Function search]: An error occured while querying the Google API."
                    )
                log(action_logging_enum=INFO,
                    logging_text="[Function search]: Error description: {err}".
                    format(err=str(e)))
                return -1

        status_code = int(response.status_code)

        if status_code >= 200 and status_code < 400:
            return response.content
        else:
            if wait_after_429:
                while status_code < 200 or status_code >= 400:
                    log(action_logging_enum=WARNING,
                        logging_text="Google Search returned Status Code 429. "
                        "Next check in: 5 minutes.")
                    time.sleep(300)
                    try:
                        response = requests.get(google_url,
                                                headers=usr_agent,
                                                timeout=10)
                        status_code = int(response.status_code)
                    except Exception as e:
                        log(action_logging_enum=ERROR,
                            logging_text=
                            "An error occured while querying the Google API.")
                        log(action_logging_enum=INFO,
                            logging_text="Error description: {err}".format(
                                err=str(e)))
                return response.content

        return -1
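    # Usage sketch (added, not part of the original snippet): fetch_results() is nested
    # inside a surrounding search helper (it reads usr_agent and wait_after_429 from the
    # enclosing scope) and returns raw HTML bytes, or -1 on failure. A caller in that
    # scope might do:
    #     html = fetch_results("example login", number_results=10, language_code="de")
    #     if html == -1:
    #         log(action_logging_enum=WARNING, logging_text="Google query failed.")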