Example #1
def clean_exit(error_code, message, kill_generator=False):
    """ Performs a clean exit, useful for when errors happen that can't be recovered from."""
    logger.log_critical(module_name, message)
    if kill_generator:
        generator.stop_generator(2)
    logger.log_stop()
    sys.exit(error_code)
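A minimal usage sketch for this helper; the call site, error code, and message below are hypothetical, not taken from the source:

if mem_map is None:
    # Unrecoverable parse failure: stop the generator workers, then exit.
    clean_exit(1, 'Failed to parse memory mapping file', kill_generator=True)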
Example #2
def test_reader():
    from sys import argv, exit
    from os import fdopen, remove
    import logging
    import tempfile
    import traceback

    if len(argv) < 4:
        print(argv[0], '<input_file>', '<memory_file>', '<bin_dir>')
        exit(0)

    logger.log_start(logging.DEBUG)

    try:
        ofile = tempfile.mkstemp(text=True)
        ofilefd = fdopen(ofile[0], 'w')

        mem_map = read_memory_file(argv[2])

        for entry in disasm_pt_file(argv[1], argv[3], mem_map):
            if entry is None:
                break
            ofilefd.write(str(entry) + "\n")

        ofilefd.close()
    except Exception:
        traceback.print_exc()
        ofilefd.close()
        remove(ofile[1])
        logger.log_stop()
        exit(1)

    logger.log_info(module_name, 'Wrote generated tuples to ' + str(ofile[1]))
    logger.log_stop()
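For reference, tempfile.mkstemp() returns an (OS-level file descriptor, path) pair rather than a file object, which is why the example wraps index 0 with fdopen and removes index 1 during cleanup. A self-contained sketch of the same pattern:

import os
import tempfile

fd, path = tempfile.mkstemp(text=True)
try:
    with os.fdopen(fd, 'w') as ofile:  # take ownership of the descriptor
        ofile.write("example\n")
finally:
    os.remove(path)  # mkstemp does not delete the file automatically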
Example #3
def test_generator():
    from sys import argv, exit
    import logging
    import os
    import queue
    import traceback
    import reader
    import tempfile

    if len(argv) < 5:
        print(argv[0], '<input_file>', '<bin_dir>', '<memory_file>',
              '<seq_len>')
        exit(0)

    logger.log_start(logging.DEBUG)

    try:
        ofile = tempfile.mkstemp(text=True)
        ofilefd = os.fdopen(ofile[0], 'w')

        filters.set_filters(['ret'])
        memory = reader.read_memory_file(argv[3])

        in_queue, out_queue = start_generator(2,
                                              reader.disasm_pt_file,
                                              seq_len=int(argv[4], 10))
        in_queue.put((None, argv[1], argv[2], memory))
        while True:
            try:
                res = out_queue.get(True, 5)
            except queue.Empty:
                count = get_in_service()
                if count == 0:
                    break
                else:
                    logger.log_debug(
                        module_name,
                        str(count) + ' workers still working on jobs')
                    continue
            ofilefd.write(str(res[0]) + ": " + str(res[1]) + "\n")

        stop_generator(10)
        ofilefd.close()
    except Exception:
        traceback.print_exc()
        ofilefd.close()
        os.remove(ofile[1])
        logger.log_stop()
        exit(1)

    logger.log_info(module_name, 'Wrote generated tuples to ' + str(ofile[1]))
    logger.log_stop()
Example #4
def clean_exit(error_code, message):
    """ Performs a clean exit, useful for when errors happen that can't be recovered from."""
    logger.log_critical(MODULE_NAME, message)
    logger.log_stop()
    sys.exit(error_code)
Example #5
    if not options.skip_test:
        logger.log_info(MODULE_NAME, 'Starting testing')
        try:
            test_model(sets_meta['b_test'])
        except KeyboardInterrupt:
            clean_exit(EXIT_USER_INTERRUPT,
                       'Keyboard interrupt, cleaning up...')
        except Exception:
            clean_exit(EXIT_RUNTIME_ERROR,
                       "Unexpected error:\n" + str(traceback.format_exc()))
    else:
        logger.log_info(MODULE_NAME, 'Skipping testing')

    # Evaluating model
    if not options.skip_eval:
        logger.log_info(MODULE_NAME, 'Starting evaluation')
        try:
            eval_model(sets_meta['b_test'] + sets_meta['m_test'])
        except KeyboardInterrupt:
            clean_exit(EXIT_USER_INTERRUPT,
                       'Keyboard interrupt, cleaning up...')
        except Exception:
            clean_exit(EXIT_RUNTIME_ERROR,
                       "Unexpected error:\n" + str(traceback.format_exc()))
    else:
        logger.log_info(MODULE_NAME, 'Skipping evaluation')

    # Cleanup
    logger.log_info(MODULE_NAME, 'Cleaning up and exiting')
    logger.log_stop()
Example #6
def main():
    global edges, max_seq

    # Parse input arguments
    parser = OptionParser(
        usage='Usage: %prog [options] pt_trace_dir output_file')
    parser.add_option('-r',
                      '--parse-ret',
                      action='store_true',
                      dest='parse_ret',
                      help='Consider returns')
    parser.add_option('-c',
                      '--parse-icall',
                      action='store_true',
                      dest='parse_icall',
                      help='Consider indirect calls')
    parser.add_option('-j',
                      '--parse-ijmp',
                      action='store_true',
                      dest='parse_ijmp',
                      help='Consider indirect jumps')
    parser.add_option('-s',
                      '--sequence-length',
                      action='store',
                      dest='max_seq',
                      type='int',
                      default=32,
                      help='Max sequence length to calculate (default: 32)')

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        sys.stdout.write("\n  Note: Only preprocessed traces are supported\n")
        sys.exit(1)

    trace_filepath = os.path.join(args[0], 'trace_parsed.gz')
    opath = args[1]
    max_seq = options.max_seq

    # Input validation
    if not os.path.isfile(trace_filepath):
        sys.stderr.write('Error: ' + str(trace_filepath) +
                         " either does not exist or is not a file\n")
        sys.exit(1)

    if options.parse_ret:
        filters.add_filter('ret')

    if options.parse_icall:
        filters.add_filter('icall')

    if options.parse_ijmp:
        filters.add_filter('ijmp')

    if filters.get_num_enabled() == 0:
        sys.stderr.write(
            "Error: Must specify at least one thing to learn (-r, -c, -j)\n")
        sys.exit(1)

    # Initialization
    logger.log_start(20)  # 20 == logging.INFO
    history = list()  # History of past basic blocks

    # edges is a three-level dictionary where the keys for the first layer are sequence lengths,
    # the keys for the second layer are source BBID(s), and the keys for the third layer are
    # destination BBIDs. The value is a count (i.e., how many times that (src, dst) pair has occurred).
    edges = dict()
    for seq_len in range(1, max_seq + 1):
        edges[seq_len] = dict()

    # Parsing
    logger.log_info(module_name, 'Parsing trace')
    for entry in reader.read_preprocessed(trace_filepath):
        if entry is None:
            break  # End of trace

        src_bbid, dst_bbid, instr = entry[:3]

        # Update history
        history.append(src_bbid)
        if len(history) > max_seq:
            history.pop(0)

        if not any(func(entry) for func in filters.enabled_filters):
            continue

        for seq_len in range(1, min(len(history), max_seq) + 1):
            insert(history[-seq_len:], dst_bbid)

    # Distribution of how many possible destinations sources have, up to df_max destinations.
    logger.log_info(module_name, 'Calculating distributions')
    df_max = 100
    df = np.zeros((max_seq, df_max), dtype=int)

    for seq_len in range(1, max_seq + 1):
        for src_bbid in edges[seq_len]:
            dst_size = len(edges[seq_len][src_bbid].keys())
            if dst_size <= df_max:
                df[seq_len - 1][dst_size - 1] += 1
            else:
                df[seq_len - 1][df_max - 1] += 1

    # Save statistics
    logger.log_info(module_name, 'Saving statistics to ' + str(opath))
    with open(opath, 'w') as ofile:
        # Header
        ofile.write('seq_len,' +
                    ','.join([str(x) for x in range(1, df_max + 1)]) + "\n")
        # Data
        for seq_len in range(1, max_seq + 1):
            ofile.write(
                str(seq_len) + ',' +
                ','.join([str(x) for x in df[seq_len - 1]]) + "\n")

    # Cleanup
    logger.log_stop()
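The insert() helper called in the parsing loop above is not shown in this example. A minimal sketch, assuming the three-level edges layout described in the comments (sequence length, then source BBID history, then destination BBID, mapping to a count):

def insert(src_bbids, dst_bbid):
    """Hypothetical sketch: record one (source history, destination) pair."""
    src_key = tuple(src_bbids)  # list slices are unhashable; tuples are not
    layer = edges[len(src_bbids)]
    if src_key not in layer:
        layer[src_key] = dict()
    layer[src_key][dst_bbid] = layer[src_key].get(dst_bbid, 0) + 1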
Example #7
def main():
    """Main"""
    global threshold

    parser = OptionParser(usage='Usage: %prog [options] eval_dir', version='Barnum Classifier ' + module_version)
    parser.add_option('-f', '--force', action='store_true',
                      help='Force threshold to produce no false positives (benign classified as malicious)')
    parser.add_option('-s', '--save', action='store', type='str', default=None,
                      help='Save classifier to given filepath (default: no saving)')
    parser.add_option('-l', '--load', action='store', type='str', default=None,
                      help='Use a previously saved classifier instead of making a new one')
    parser.add_option('-c', '--csv', action='store', type='str', default=None,
                      help='Save CSV of results to given filepath (default: no CSV)')
    parser.add_option('-p', '--plot', action='store', type='str', default=None,
                      help='Save plot as a PNG image to the given filepath (default: no plotting)')
    parser.add_option('-r', '--roc', action='store', type='str', default=None,
                      help='Save CSV plotting ROC curve to filepath (default: not saved)')
    parser.add_option('-w', '--workers', action='store', dest='workers', type='int', default=cpu_count(),
                      help='Number of workers to use (default: number of cores)')
    parser.add_option('-i', '--ignore-cache', action='store_true',
                      help='Do not use caching')

    options, args = parser.parse_args()

    if len(args) != 1 or options.workers < 1:
        parser.print_help()
        sys.exit(ERROR_INVALID_ARG)

    logger.log_start(20)  # 20 == logging.INFO
    logger.log_info(module_name, 'Barnum Classifier ' + module_version)

    idirpath = args[0]

    if not os.path.isdir(idirpath):
        logger.log_error(module_name, 'ERROR: ' + idirpath + " is not a directory")
        logger.log_stop()
        sys.exit(ERROR_INVALID_ARG)

    files = [os.path.join(idirpath, f) for f in os.listdir(idirpath) if os.path.isfile(os.path.join(idirpath, f))]
    num_benign = len([fp for fp in files if 'benign' in os.path.basename(fp)])
    num_malicious = len([fp for fp in files if 'malicious' in os.path.basename(fp)])

    if options.load is None and (num_benign == 0 or num_malicious == 0):
        logger.log_error(module_name, "Need at least 1 malicious and 1 benign sample to train a classifier")
        logger.log_stop()
        sys.exit(ERROR_INVALID_ARG)

    if options.roc is not None and (num_benign == 0 or num_malicious == 0):
        logger.log_error(module_name, "Need at least 1 malicious and 1 benign sample to plot a ROC curve")
        logger.log_stop()
        sys.exit(ERROR_INVALID_ARG)

    if not options.ignore_cache:
        init_cache()

    # Calculate average accuracy and confidence for each sample
    logger.log_info(module_name, "Parsing " + idirpath)
    pool = Pool(options.workers)
    data = [sample for sample in pool.map(parse_file, zip(files, [options] * len(files))) if sample[0] < 2]
    pool.close()
    ys = np.array([sample[0] for sample in data])
    xs = np.array([sample[1:3] for sample in data])

    if options.load is None:
        logger.log_info(module_name, "Creating classifier")
        # Train a new classifier from scratch
        if options.force:
            # Use ADASYN to oversample the benign class until the FP rate falls to 0
            warnings.filterwarnings("ignore", module="imblearn")
            fp = 1.0
            ben_cnt = len([y for y in ys if y == 0])
            mal_cnt = len(ys) - ben_cnt
            ben_step = max(1, int(ben_cnt * 0.1))

            while fp > 0.0:
                ben_cnt += ben_step
                try:
                    xs_os, ys_os = ADASYN({0: ben_cnt, 1: mal_cnt}, n_jobs=options.workers).fit_resample(xs, ys)
                except ValueError:
                    continue  # Happens if change in counts produces too little change in ratio

                svm = SVC(kernel='linear')
                svm.fit(xs_os, ys_os)

                results = [[sample, svm.predict([sample[1:3]])] for sample in data]
                benign = [sample for sample in results if sample[0][0] == 0]
                fps = [sample for sample in results if sample[0][0] == 0 and sample[1][0] == 1]
                fp = float(len(fps)) / float(len(benign))
        else:
            svm = SVC(kernel='linear')
            svm.fit(xs, ys)
    else:
        # Use a previously saved classifier
        logger.log_info(module_name, "Loading classifier from " + options.load)
        try:
            svm = joblib.load(options.load)
        except Exception as ex:
            logger.log_error(module_name, "Failed to load classifier: " + str(ex))
            logger.log_stop()
            sys.exit(ERROR_RUNTIME)

    # Metrics
    results = [[sample, svm.predict([sample[1:3]])] for sample in data]
    benign = [sample for sample in results if sample[0][0] == 0]
    malicious = [sample for sample in results if sample[0][0] == 1]
    fps = [sample for sample in results if sample[0][0] == 0 and sample[1][0] == 1]
    fns = [sample for sample in results if sample[0][0] == 1 and sample[1][0] == 0]

    if len(benign) > 0:
        fp = float(len(fps)) / float(len(benign))
    else:
        fp = 'N/A'
    if len(malicious) > 0:
        fn = float(len(fns)) / float(len(malicious))
    else:
        fn = 'N/A'

    logger.log_info(module_name, "----------")
    logger.log_info(module_name, "FP: " + str(fp))
    logger.log_info(module_name, "FN: " + str(fn))
    logger.log_info(module_name, "----------")

    # Saving CSV
    if options.csv is not None:
        logger.log_info(module_name, "Saving CSV to " + options.csv)
        try:
            with open(options.csv, 'w') as csv_file:
                csv_file.write("true_label,pred_label,avg_accuracy,avg_confidence,name\n")
                for result in results:
                    csv_file.write(','.join([str(result[0][0]), str(result[1][0]), str(result[0][1]), str(result[0][2]), result[0][3]]) + "\n")
        except Exception as ex:
            logger.log_error(module_name, "Failed to save CSV: " + str(ex))

    # Saving Classifier
    if options.save is not None:
        logger.log_info(module_name, "Saving classifier to " + options.save)
        try:
            joblib.dump(svm, options.save)
        except Exception:
            logger.log_error(module_name, "Failed to save classifier to " + options.save)

    # Plotting
    if options.plot is not None:
        logger.log_info(module_name, "Saving plot to " + options.plot)
        axes = plt.gca()
        axes.set_xlim([0, 1])
        axes.set_ylim([0, 1])
        w = svm.coef_[0]
        a = -w[0] / w[1]
        xx = np.linspace(0, 1)
        yy = a * xx - (svm.intercept_[0]) / w[1]
        plt.scatter([sample[0][1] for sample in benign], [sample[0][2] for sample in benign], marker='o', c='blue', s=20)
        plt.scatter([sample[0][1] for sample in malicious], [sample[0][2] for sample in malicious], marker='x', c='red', s=20)
        plt.plot(xx, yy, 'k--')
        plt.xlabel('Wrong Prediction (%)')
        plt.ylabel('Average Confidence (%)')
        try:
            plt.savefig(options.plot)
        except Exception:
            logger.log_error(module_name, "Failed to save plot")

    # ROC
    if options.roc is not None:
        logger.log_info(module_name, "Saving ROC to " + options.roc)
        make_roc(options.roc, data, svm)

    logger.log_stop()
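make_roc() is not defined in this example. A minimal sketch, assuming the SVM's decision_function output is used as the ranking score and the CSV layout mirrors the other outputs (both assumptions, not confirmed by the source):

def make_roc(opath, data, svm):
    from sklearn.metrics import roc_curve
    ys = np.array([sample[0] for sample in data])  # true labels
    # Hypothetical score choice: signed distance from the separating hyperplane
    scores = svm.decision_function(np.array([sample[1:3] for sample in data]))
    fpr, tpr, thresholds = roc_curve(ys, scores)
    with open(opath, 'w') as roc_file:
        roc_file.write("fpr,tpr,threshold\n")
        for point in zip(fpr, tpr, thresholds):
            roc_file.write(','.join([str(v) for v in point]) + "\n")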
Example #8
def main():
    # Parse input arguments
    parser = OptionParser(
        usage='Usage: %prog [options] trace_directory bin_directory')
    parser.add_option(
        '-f',
        '--force',
        action='store_true',
        help='If a complete or partial output already exists, overwrite it.')
    parser.add_option(
        '-t',
        '--timeout',
        action='store',
        type='int',
        default=None,
        help='Max seconds to run before quitting (default: infinite).')
    parser.add_option(
        '-p',
        '--no-partial',
        action='store_true',
        help='If timeout is reached, do not save the partially parsed trace.')
    options, args = parser.parse_args()

    if len(args) < 2:
        parser.print_help()
        sys.exit(0)

    data_dir = args[0]
    bin_dir = args[1]

    logger.log_start(logging.INFO)

    # Input validation
    if not os.path.isdir(data_dir):
        logger.log_error(module_name, data_dir + ' is not a directory')
        logger.log_stop()
        sys.exit(1)

    if not os.path.isdir(bin_dir):
        logger.log_error(module_name, bin_dir + ' is not a directory')
        logger.log_stop()
        sys.exit(1)

    if options.timeout is None and options.no_partial:
        logger.log_warning(
            module_name, "Setting --no-partial without --timeout does nothing")

    # Make sure all the expected files are there
    mem_file = None
    trace_file = None

    files = os.listdir(data_dir)
    for fname in files:
        if fname in ('mapping.txt', 'mapping.txt.gz'):
            mem_file = os.path.join(data_dir, fname)
        elif fname in ('trace_0', 'trace_0.gz'):
            trace_file = os.path.join(data_dir, fname)

    if mem_file is None:
        logger.log_error(
            module_name,
            'Could not find mapping.txt or mapping.txt.gz in ' + data_dir)
        logger.log_stop()
        sys.exit(1)

    if trace_file is None:
        logger.log_error(module_name,
                         'Could not find trace_0 or trace_0.gz in ' + data_dir)
        logger.log_stop()
        sys.exit(1)

    # Parse the memory file
    mem_map = reader.read_memory_file(mem_file)
    if mem_map is None:
        logger.log_error(module_name, 'Failed to parse memory mapping file')
        logger.log_stop()
        sys.exit(1)

    # We're ready to parse the trace
    o_filepath = os.path.join(data_dir, 'trace_parsed.gz')

    if os.path.isfile(o_filepath) and not options.force:
        logger.log_error(module_name, 'Preprocess file already exists')
        logger.log_stop()
        sys.exit(1)

    if os.path.isfile(o_filepath + '.part') and not options.force:
        logger.log_error(module_name, 'Partial preprocess file already exists')
        logger.log_stop()
        sys.exit(1)

    entries = 0
    with gzip.open(o_filepath + '.part', 'wb') as ofile:
        for instr in reader.disasm_pt_file(trace_file, bin_dir, mem_map,
                                           options.timeout):
            if instr is None:
                break
            ofile.write(pack_instr(instr))
            entries += 1

    if reader.DISASM_TIMEOUT.is_set() and options.no_partial:
        logger.log_info(module_name, "Deleting partial trace")
        os.remove(o_filepath + '.part')
    elif entries > 0:
        os.rename(o_filepath + '.part', o_filepath)
    else:
        logger.log_error(module_name, 'No output produced, empty file')
        os.remove(o_filepath + '.part')

    logger.log_stop()
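pack_instr() is not defined in this example. A sketch under the assumption that each entry is a flat tuple of unsigned integer fields serialized at a fixed width (the actual on-disk format is not given by the source); gzip handles compression upstream:

import struct

def pack_instr(instr):
    # Hypothetical layout: one little-endian unsigned 64-bit value per field.
    return struct.pack('<%dQ' % len(instr), *instr)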
Example #9
def main():
    """Main"""
    parser = OptionParser(usage='Usage: %prog [options] eval_dir',
                          version='Barnum Cluster ' + module_version)
    parser.add_option(
        '-c',
        '--csv',
        action='store',
        type='str',
        default=None,
        help='Save CSV of results to given filepath (default: no CSV)')
    parser.add_option(
        '-p',
        '--plot',
        action='store',
        type='str',
        default=None,
        help='Save plot as a PNG image to the given filepath '
             '(default: no plotting)')
    parser.add_option(
        '-w',
        '--workers',
        action='store',
        dest='workers',
        type='int',
        default=cpu_count(),
        help='Number of workers to use (default: number of cores)')
    parser.add_option('--max-classes',
                      action='store',
                      type='int',
                      default=256,
                      help='How many classes to use (default: 256)')
    parser.add_option(
        '--min-samples',
        action='store',
        type='int',
        default=4,
        help='Minimum samples to form a cluster in DBSCAN (default: 4)')
    parser.add_option('--eps',
                      action='store',
                      type='float',
                      default=0.03,
                      help='Epsilon parameter to DBSCAN (default: 0.03)')

    options, args = parser.parse_args()

    if len(args) != 1 or options.workers < 1:
        parser.print_help()
        sys.exit(ERROR_INVALID_ARG)

    logger.log_start(20)  # 20 == logging.INFO
    logger.log_info(module_name, 'Barnum Cluster %s' % module_version)

    idirpath = args[0]

    if not os.path.isdir(idirpath):
        logger.log_error(module_name,
                         'ERROR: %s is not a directory' % idirpath)
        logger.log_stop()
        sys.exit(ERROR_INVALID_ARG)

    files = [
        os.path.join(idirpath, f) for f in os.listdir(idirpath)
        if os.path.isfile(os.path.join(idirpath, f))
    ]
    # We only care about clustering malicious traces
    mal_files = [fp for fp in files if 'malicious' in os.path.basename(fp)]
    num_mal = len(mal_files)

    # Calculate clustering metrics
    logger.log_info(module_name, "Parsing " + idirpath)
    pool = Pool(options.workers)
    data = [
        sample for sample in pool.map(
            parse_file, zip(mal_files, [options.max_classes] * num_mal))
        if sample
    ]
    pool.close()
    xs = np.array([sample[0] for sample in data])
    ns = [sample[1] for sample in data]

    # Clustering
    logger.log_info(module_name, "Calculating clusters")
    db = DBSCAN(eps=options.eps, min_samples=options.min_samples).fit(xs)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
    logger.log_info(module_name, '      Number of points: %d' % len(ns))
    logger.log_info(module_name, '    Number of clusters: %d' % n_clusters)
    logger.log_info(module_name, 'Number of noise points: %d' % n_noise)

    # Saving results as CSV
    if options.csv is not None:
        logger.log_info(module_name, "Saving CSV to %s" % options.csv)
        try:
            with open(options.csv, 'w') as csv_file:
                csv_file.write("cluster,filename\n")
                for label, name in zip(labels, ns):
                    csv_file.write(','.join([str(label), name]) + "\n")
        except Exception as ex:
            logger.log_error(module_name, "Failed to save CSV: %s" % str(ex))

    # Saving results as plot image
    if options.plot is not None:
        logger.log_info(module_name, "Generating plot")
        theta = radar_factory(options.max_classes, frame='polygon')
        fig, axes = plt.subplots(subplot_kw=dict(projection='radar'))
        colors = ['b', 'r', 'g', 'm', 'y']
        axes.set_varlabels([""])  # no varlabels, they aren't that meaningful
        axes.set_rgrids([0.2, 0.4, 0.6, 0.8])
        legend_labels = list()
        for label_key in set(labels):
            if label_key == -1:
                continue  # noise
            legend_labels.append(label_key)
            label_color = colors[label_key % len(colors)]
            # Calculate per-cluster average
            label_mask = (labels == label_key)
            label_points = xs[label_mask & core_samples_mask]
            label_means = np.mean(label_points, axis=0)
            axes.plot(theta, label_means, color=label_color)
            axes.fill(theta, label_means, facecolor=label_color, alpha=0.25)
        # Legend
        legend = axes.legend(legend_labels,
                             loc=(0.9, .95),
                             labelspacing=0.1,
                             fontsize='small')

        try:
            plt.savefig(options.plot)
        except Exception:
            logger.log_error(module_name, "Failed to save plot")

    logger.log_stop()
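parse_file() is not shown in this example. A plausible sketch, assuming each input file stores one predicted class label per line and that the feature vector is the normalized frequency of each class over max_classes bins (the file format here is an assumption, not taken from the source):

def parse_file(job):
    filepath, max_classes = job
    counts = np.zeros(max_classes)
    try:
        with open(filepath) as ifile:
            for line in ifile:
                counts[int(line) % max_classes] += 1
    except (IOError, ValueError):
        return None  # falsy, so the caller's "if sample" filter drops it
    if counts.sum() == 0:
        return None
    # (feature vector, sample name), matching the xs/ns unpacking above
    return (counts / counts.sum(), os.path.basename(filepath))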