# Example 1
def handle_features_1dir(samples_dir, label, level, features_choice, n,
                         analysis_path):
    """ handle_features_1file for ALL files from a directory.
    Case one folder.

    -------
    Parameters:
    - samples_dir: str
        Directory containing the samples to analyze.
    - label: str
        Label tagging every sample of the directory (e.g. benign/malicious
        -- TODO confirm against callers).
    - level: str
        Unit level forwarded to the feature extraction (e.g. 'tokens',
        'ast', 'cfg', 'pdg').
    - features_choice: str
        Feature-extraction strategy; also used as a sub-folder name.
    - n: int
        Sliding-window size forwarded to the feature extraction.
    - analysis_path: str
        Root directory where the pickled feature dict is stored.
    """

    if not os.path.exists(analysis_path):
        os.makedirs(analysis_path)

    pickle_path = os.path.join(analysis_path, features_choice,
                               level + '_all_features_' + label)
    utility.check_folder_exists(pickle_path)

    # Resume from a previous run when a pickle already exists.
    if os.path.isfile(pickle_path):
        with open(pickle_path, 'rb') as pickle_file:
            all_features_dict = pickle.load(pickle_file)
    else:
        all_features_dict = dict()

    analyses = get_features_all_files_multiproc(samples_dir, level,
                                                features_choice, n)

    start = timeit.default_timer()

    for analysis in analyses:
        features_dict = analysis.features
        if features_dict is not None:
            try:
                handle_features_1file(features_dict, all_features_dict)
            # Best-effort per file, but catch Exception (not bare except) so
            # SystemExit / KeyboardInterrupt still propagate.
            except Exception:
                logging.exception('Something went wrong with %s',
                                  analysis.file_path)

    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(all_features_dict, pickle_file)
    utility.micro_benchmark('Total elapsed time:',
                            timeit.default_timer() - start)
# Example 2
def get_features_all_files_multiproc(samples_dir_list, labels_list, level,
                                     features_choice, n):
    """
        Gets the features of all files from samples_dir_list.

        -------
        Parameters:
        - samples_dir_list: list of str
            Directories whose samples are analyzed.
        - labels_list: list of str
            One label per directory, applied to all its samples.
        - level: str
            Unit level (e.g. 'tokens', 'ast', 'cfg', 'pdg').
        - features_choice: str
            Feature-extraction strategy forwarded to the workers.
        - n: int
            Sliding-window size forwarded to the workers.

        -------
        Returns:
        - list
            Analysis objects collected from the worker processes.
    """

    start = timeit.default_timer()
    ram = psutil.virtual_memory().used

    my_queue = Queue()
    out_queue = Queue()
    except_queue = Queue()
    workers = list()

    # One work item per sample file, tagged with its directory's label.
    for samples_dir, label in zip(samples_dir_list, labels_list):
        for sample in os.listdir(samples_dir):
            sample_path = os.path.join(samples_dir, sample)
            analysis = static_analysis.Analysis(pdg_path=sample_path,
                                                label=label)
            my_queue.put([analysis, level, features_choice, n])

    for _ in range(utility.NUM_WORKERS):
        proc = Process(target=features_preselection.worker_get_features,
                       args=(my_queue, out_queue, except_queue))
        proc.start()
        workers.append(proc)

    analyses = list()

    # Drain out_queue while the workers run. We cannot simply join() the
    # workers: a child cannot be joined while items it produced are still
    # buffered in out_queue or except_queue, so join-then-drain deadlocks.
    # Instead we poll exitcode (non-None once a worker has exited).
    while True:
        try:
            analyses.append(out_queue.get(timeout=0.01))
        except queue.Empty:
            pass
        all_exited = all(w.exitcode is not None for w in workers)
        # Logical 'and' (was bitwise '&'): short-circuit so the queue is only
        # re-checked once every worker has exited.
        if all_exited and out_queue.empty():
            break

    utility.get_ram_usage(psutil.virtual_memory().used - ram)
    utility.micro_benchmark('Total elapsed time for features production:',
                            timeit.default_timer() - start)

    return analyses
# Example 3
def get_features_all_files_multiproc(samples_dir_list, labels_list):
    """
        Gets the features of all files from samples_dir_list.

        -------
        Parameters:
        - samples_dir_list: list of str
            Directories whose samples are analyzed.
        - labels_list: list of str
            One label per directory, applied to all its samples.

        -------
        Returns:
        - list
            Analysis objects collected from the worker processes.
    """

    start = timeit.default_timer()

    my_queue = Queue()
    out_queue = Queue()
    except_queue = Queue()
    workers = list()

    # One work item per sample file, tagged with its directory's label.
    for samples_dir, label in zip(samples_dir_list, labels_list):
        for sample in os.listdir(samples_dir):
            sample_path = os.path.join(samples_dir, sample)
            analysis = features_preselection.Analysis(file_path=sample_path,
                                                      label=label)
            my_queue.put(analysis)

    for _ in range(utility.NUM_WORKERS):
        proc = Process(target=features_preselection.worker_get_features,
                       args=(my_queue, out_queue, except_queue))
        proc.start()
        workers.append(proc)

    analyses = list()

    # Drain out_queue while the workers run: joining a worker whose output is
    # still buffered in out_queue/except_queue would deadlock, so we poll
    # exitcode (non-None once a worker has exited) instead of join().
    while True:
        try:
            analyses.append(out_queue.get(timeout=0.01))
        except queue.Empty:
            pass
        all_exited = all(w.exitcode is not None for w in workers)
        # Logical 'and' (was bitwise '&'): short-circuit so the queue is only
        # re-checked once every worker has exited.
        if all_exited and out_queue.empty():
            break

    utility.micro_benchmark('Total elapsed time for features production:',
                            timeit.default_timer() - start)

    return analyses
# Example 4
def analyze_features_all(all_features_dict1, all_features_dict2,
                         samples_dir_list, labels_list, analysis_path,
                         path_info):
    """ Produces a dict containing the number of occurrences (or not) of each expected feature
    with a distinction between benign and malicious files.

    -------
    Parameters:
    - all_features_dict1, all_features_dict2: dict
        Expected-feature dicts merged into the initial analyzed dict.
    - samples_dir_list: list of str
        Directories whose samples are analyzed.
    - labels_list: list of str
        One label per directory; must contain 'benign' and 'malicious'.
    - analysis_path: str
        Directory where the result pickle is stored.
    - path_info: str
        Suffix distinguishing the result pickle's file name.

    -------
    Returns:
    - dict or None
        The analyzed-features dict, or None on invalid input.
    """

    if len(samples_dir_list) != len(labels_list):
        logging.error(
            "Something is wrong with the size of samples_dir_list and label_list."
            + " Got %s and %s", str(len(samples_dir_list)),
            str(len(labels_list)))
        return None

    # Both labels are required to build the benign/malicious distinction.
    # Fixed: the original used 'and', which only rejected inputs missing BOTH
    # labels, contradicting the error message below.
    if "benign" not in labels_list or "malicious" not in labels_list:
        logging.error("Expected 'benign' and 'malicious' in labels_list")
        return None

    if not os.path.exists(analysis_path):
        os.makedirs(analysis_path)

    pickle_path = os.path.join(analysis_path,
                               '_analyzed_features_' + path_info)

    analyzed_features_dict = initialize_analyzed_features_dict(
        all_features_dict1, all_features_dict2)

    analyses = get_features_all_files_multiproc(samples_dir_list, labels_list)

    start = timeit.default_timer()

    for analysis in analyses:
        features_dict = analysis.features
        label = analysis.label
        if features_dict is not None:
            analyze_features(analyzed_features_dict, features_dict, label)

    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(analyzed_features_dict, pickle_file)

    utility.micro_benchmark('Total elapsed time:',
                            timeit.default_timer() - start)
    return analyzed_features_dict
# Example 5
def main_analysis(js_dirs, js_files, labels_files, labels_dirs,
                  features2int_dict_path):
    """
        Main function, performs a static analysis (syntactic using the AST)
        of JavaScript files given in input.

        -------
        Parameters:
        - js_dirs: list of strings
            Directories containing the JS files to be analysed.
        - js_files: list of strings
            Files to be analysed.
        - labels_files: list of strings
            True label's name of the current data: either benign or malicious.
            One label for one file.
        - labels_dirs: list of strings
            True label's name of the current data: either benign or malicious.
            One label for one directory.
        - features2int_dict_path: str
            Path of the dictionary mapping features to int.

        -------
        Returns:
        -list:
            Contains the results of the static analysis of the files given as input.
            * 1st element: list containing valid files' name (i.e. files that could be parsed);
            * 2nd element: sparse matrix containing the features results;
            * 3rd element: list containing the true labels of the valid JS files.
            Returns None when neither js_dirs nor js_files is given.

    """

    start = timeit.default_timer()

    global features2int_dict
    with open(features2int_dict_path, 'rb') as pickle_file:
        features2int_dict = pickle.load(pickle_file)

    if js_dirs is None and js_files is None:
        logging.error(
            'Please, indicate a directory or a JS file to be studied')
        return None

    if js_files is not None:
        # Keep the caller's list as files2do so directory files are appended
        # to it, matching the original aliasing behavior.
        files2do = js_files
        if labels_files is None:
            labels_files = ['?' for _ in js_files]
        labels = labels_files
    else:
        files2do, labels = [], []

    if js_dirs is not None:
        if labels_dirs is None:
            labels_dirs = ['?' for _ in js_dirs]
        # labels_dirs is guaranteed non-None here, so the per-file label
        # append is unconditional (the original re-checked it pointlessly).
        for i, cdir in enumerate(js_dirs):
            for cfile in os.listdir(cdir):
                files2do.append(os.path.join(cdir, cfile))
                labels.append(labels_dirs[i])

    analyses = get_features(files2do, labels)
    logging.info('Got all features')
    features_repr = get_features_representation(analyses)

    utility.micro_benchmark('Total elapsed time:',
                            timeit.default_timer() - start)

    return features_repr
# Example 6
def main_analysis(js_dirs, js_files, labels_files, labels_dirs, level,
                  features_choice, n, features2int_dict_path):
    """
        Main function, performs a static analysis (syntactic using the AST)
        of JavaScript files given in input.

        -------
        Parameters:
        - js_dirs: list of strings
            Directories containing the JS files to be analysed.
        - js_files: list of strings
            Files to be analysed.
        - labels_files: list of strings
            True label's name of the current data: either benign or malicious.
            One label for one file.
        - labels_dirs: list of strings
            True label's name of the current data: either benign or malicious.
            One label for one directory.
        - level: str
            Either 'tokens', 'ast', 'cfg', or 'pdg' depending on the units you want to extract.
        - features_choice: str
            Feature-extraction strategy forwarded to get_features.
        - n: int
            Stands for the size of the sliding-window which goes through the units contained
            in the files to be analysed.
        - features2int_dict_path: str
            Path of the dictionary mapping features to int.

        -------
        Returns:
        -list:
            Contains the results of the static analysis of the files given as input.
            * 1st element: list containing valid files' name (i.e. files that could be parsed);
            * 2nd element: list / csr_matrix representing the analysis results (n-grams frequency)
            with one line per valid JS file;
            * 3rd element: list containing the true labels of the valid JS files.
            Returns None when neither js_dirs nor js_files is given.
    """

    start = timeit.default_timer()
    ram = psutil.virtual_memory().used

    global features2int_dict
    with open(features2int_dict_path, 'rb') as pickle_file:
        features2int_dict = pickle.load(pickle_file)

    if js_dirs is None and js_files is None:
        logging.error(
            'Please, indicate a directory or a JS file to be studied')
        return None

    if js_files is not None:
        # Keep the caller's list as files2do so directory files are appended
        # to it, matching the original aliasing behavior.
        files2do = js_files
        if labels_files is None:
            labels_files = ['?' for _ in js_files]
        labels = labels_files
    else:
        files2do, labels = [], []

    if js_dirs is not None:
        if labels_dirs is None:
            labels_dirs = ['?' for _ in js_dirs]
        # labels_dirs is guaranteed non-None here, so the per-file label
        # append is unconditional (the original re-checked it pointlessly).
        for i, cdir in enumerate(js_dirs):
            for cfile in os.listdir(cdir):
                files2do.append(os.path.join(cdir, cfile))
                labels.append(labels_dirs[i])

    analyses = get_features(files2do, labels, level, features_choice, n)
    logging.info('Got all features')
    features_repr = get_features_representation(analyses)

    utility.get_ram_usage(psutil.virtual_memory().used - ram)
    utility.micro_benchmark('Total elapsed time:',
                            timeit.default_timer() - start)

    return features_repr