Example #1
    def model_first_predict(self):
        '''
        Score every sample in args.datadir with the black-box detector.
        '''
        for filename in tqdm.tqdm(self.dataset_iterator(args.datadir)):
            filepath = os.path.join(args.datadir, filename)
            with open(filepath, 'rb') as f:
                binary = f.read()

            ember.predict_sample(self.blackbox_detector, binary)
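The method above leans on attributes the excerpt does not show. A minimal sketch of the surrounding setup, assuming the black-box detector is an ember-trained LightGBM booster (the class name, model path, and iterator are assumptions, not part of the original):

import os

import tqdm
import ember
import lightgbm as lgb

class BlackBoxScanner:
    def __init__(self, model_path):
        # assumption: the detector is a LightGBM booster saved by ember
        self.blackbox_detector = lgb.Booster(model_file=model_path)

    def dataset_iterator(self, datadir):
        # yield the plain files sitting directly under the dataset directory
        for name in os.listdir(datadir):
            if os.path.isfile(os.path.join(datadir, name)):
                yield name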
Example #2
def predict():
    """
    Predict new datasets.
    """
    y_pred = []
    name = []
    err = 0
    # total number of files directly under args.datadir, for the progress bar
    end = len(next(os.walk(args.datadir))[2])

    for sample in tqdm.tqdm(sample_iterator(), total=end):
        fullpath = os.path.join(args.datadir, sample)

        if os.path.isfile(fullpath):
            with open(fullpath, "rb") as f:
                binary = f.read()
            name.append(sample)

            try:
                y_pred.append(ember.predict_sample(lgbm_model, binary))
            except KeyboardInterrupt:
                sys.exit()
            except Exception as e:
                y_pred.append(0)
                print("{}: error occurred: {}".format(sample, e))
                err += 1

    series = OrderedDict([('hash', name), ('y_pred', y_pred)])
    r = pd.DataFrame.from_dict(series)
    r.to_csv(os.path.join(args.output, 'result.csv'), index=False, header=None)

    return err
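predict() relies on a sample_iterator() helper the excerpt never defines. A plausible minimal version, given that the progress-bar total counts the files directly under args.datadir (this is an assumption, not the original helper):

def sample_iterator():
    # yield every directory entry; predict() filters out non-files itself
    for name in os.listdir(args.datadir):
        yield name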
Example #3
def main():
    prog = "classify_binaries"
    descr = "Use a trained ember model to make predictions on PE files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("-m",
                        "--modelpath",
                        type=str,
                        default=None,
                        required=True,
                        help="Ember model")
    parser.add_argument("binaries",
                        metavar="BINARIES",
                        type=str,
                        nargs="+",
                        help="PE files to classify")
    args = parser.parse_args()

    if not os.path.exists(args.modelpath):
        parser.error("ember model {} does not exist".format(args.modelpath))
    lgbm_model = lgb.Booster(model_file=args.modelpath)

    for binary_path in args.binaries:
        if not os.path.exists(binary_path):
            print("{} does not exist".format(binary_path))
            continue  # skip missing files instead of crashing on open()

        with open(binary_path, "rb") as f:
            file_data = f.read()
        score = ember.predict_sample(lgbm_model, file_data)

        if len(args.binaries) == 1:
            print(score)
        else:
            print("\t".join((binary_path, str(score))))
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--modelpath", type=str, required=True, help="trained model path")
    parser.add_argument("-d", "--datadir", type=str, help="directory of samples to predict", required=True)
    parser.add_argument("-o", "--output", type=str, help="output directory for predictions", required=True)
    args = parser.parse_args()

    if not os.path.exists(args.modelpath):
        parser.error("ember model {} does not exist".format(args.modelpath))
    if not os.path.exists(args.datadir):
        parser.error("data directory {} does not exist".format(args.datadir))
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    lgbm_model = lgb.Booster(model_file=args.modelpath)

    errorcount = 0
    y_pred = []
    _name = []

    for filename in tqdm.tqdm(os.listdir(args.datadir)):
        _file = os.path.join(args.datadir, filename)

        if os.path.isfile(_file):
            with open(_file, "rb") as f:
                binary = f.read()
            _name.append(filename)

            try:
                y_pred.append(ember.predict_sample(lgbm_model, binary))
            except KeyboardInterrupt:
                sys.exit()
            except Exception:
                # record a default score and keep going on malformed samples
                y_pred.append(0)
                errorcount += 1

    y_pred_01 = np.array(y_pred)

    # save predictions as <hash>,<score> rows with no header
    r = pd.DataFrame({'hash': _name, 'y_pred': y_pred_01})
    r.to_csv(os.path.join(args.output, 'result.csv'), index=False, header=None)

    print("Errors: %d" % errorcount)
Example #5
def scan(filelist, conf=DEFAULTCONF):
    results = []

    for fname in filelist:
        # Ensure libmagic returns results
        if REQUIRES[0] is not None:
            # only run the analytic if libmagic identifies a PE file
            file_type = _get_libmagicresults(REQUIRES[0][0], fname)
            if file_type.startswith('PE32'):
                with open(fname, 'rb') as fh:
                    ember_result = ember.predict_sample(LGBM_MODEL, fh.read())
                results.append(
                    (fname, {'Prediction': ember_result})
                )

    metadata = {}
    metadata["Name"] = NAME
    metadata["Type"] = TYPE
    return (results, metadata)
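scan() reads several module-level globals that this excerpt omits; the structure looks like a MultiScanner-style module, where they sit at the top of the file. A hypothetical sketch of that setup (every value is a placeholder, and the framework normally injects REQUIRES):

import lightgbm as lgb

NAME = "EmberScan"               # placeholder module name
TYPE = "MachineLearning"         # placeholder module type
DEFAULTCONF = {"ENABLED": True}  # placeholder default configuration
REQUIRES = [None]                # the framework replaces this with (results, metadata) from the libmagic module
LGBM_MODEL = lgb.Booster(model_file="ember_model_2018.txt")  # model path is a placeholder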
Example #6
def predict(self, bytez):
    return predict_sample(self.model, bytez) > self.thresh
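Only the method survives in this excerpt; a sketch of the wrapper class it implies (the class name, constructor, and default threshold are assumptions):

import lightgbm as lgb
from ember import predict_sample

class ThresholdedDetector:
    def __init__(self, model_path, thresh=0.5):
        # assumption: the model is an ember LightGBM booster saved on disk
        self.model = lgb.Booster(model_file=model_path)
        self.thresh = thresh

    def predict(self, bytez):
        # True means the score exceeded the decision threshold
        return predict_sample(self.model, bytez) > self.thresh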
Example #7
def predict_sample(self, binary_data):
    score = ember.predict_sample(self.lgbm_model, binary_data)
    return score
Example #8
import ember
import lightgbm as lgb
import sys

# verify the number of arguments
if len(sys.argv) != 2:
    print("Usage: python {0} <file>".format(sys.argv[0]))
    sys.exit(1)

# load the trained model, then read and score the PE file
filepath = sys.argv[1]

lgbm_model = lgb.Booster(model_file="/home/geoffryaf/Desktop/MDP/ember_dataset_2018_2/ember2018/ember_model_2018.txt")
with open(filepath, "rb") as f:
    file_data = f.read()
print(ember.predict_sample(lgbm_model, file_data))
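predict_sample extracts features internally; when the raw feature vector is needed as well, ember exposes the extractor directly. A minimal sketch (feature_version=2 matches the 2018 model, an assumption worth verifying against your model file; the sample path is a placeholder):

import numpy as np
import ember

extractor = ember.PEFeatureExtractor(feature_version=2)
with open("sample.exe", "rb") as f:  # path is a placeholder
    features = np.array(extractor.feature_vector(f.read()), dtype=np.float32)
print(features.shape)  # 2381 features for feature version 2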
Example #9
def analysis(filename=None, data=None, mode=0, mfl=False, ip=''):
    FLAG_S = False
    FLAG_B = False
    conn = sqlite3.connect("MalDet.db")
    c = conn.cursor()
    if data:
        # raw bytes were uploaded: persist them so they can be hashed and parsed
        path_to_file = os.path.join(UPLOAD_DIRECTORY, filename)
        with open(path_to_file, "wb") as fp:
            fp.write(data)
    if filename and not data:
        if mfl:
            path_to_file = filename
        else:
            path_to_file = os.path.join(UPLOAD_DIRECTORY, filename)
        with open(path_to_file, 'rb') as f:
            data = f.read()
    try:
        # parse only to confirm this is a valid PE file
        pefile.PE(path_to_file)
    except pefile.PEFormatError:
        return {'error': 'this is not a PE file'}
    imphash, ssdeep_hash, sha = hash_calc(path_to_file)
    c.execute("SELECT sha_1 FROM MalDet_S WHERE sha_1=?", (sha,))
    if c.fetchall():
        FLAG_S = True  # a static-analysis record already exists
    c.execute("SELECT sha1_hash FROM MalDet_B WHERE sha1_hash=?", (sha,))
    if c.fetchall():
        FLAG_B = True  # a sandbox (behavioural) record already exists
    if FLAG_S and mode != '2':
        print('existing file, searching...')
        c.execute("SELECT * FROM MalDet_S WHERE sha_1=?", (sha,))
        rows = c.fetchall()
        conn.commit()
        conn.close()
        return unit_pack(rows, mode)
    elif FLAG_B and mode == '2':
        print('existing file, searching...')
        readBlob(sha)
    else:
        print("new file, processing...")
        if mfl:
            filepath = filename
        else:
            filepath = os.path.join(UPLOAD_DIRECTORY, filename)
        if not FLAG_S:
            # route by size: big files go to the ember model, small ones to the image CNN
            if os.path.getsize(filepath) > 1500000:
                with open(filepath, "rb") as f:
                    data = f.read()
                prob = ember.predict_sample(ember_model, data)
                if prob >= 0.5:
                    result = ('malware', prob)
                else:
                    result = ('benign', 1 - prob)
            else:
                picture = createRGBImage(filepath)
                result = predict_image(tf_model, picture)
                os.remove(picture)
            if '/' in filename:
                filename = filename.split('/')[-1]
            matches = search_for_matches(c, filename, ssdeep_hash, imphash, sha)
            data_tuple = (filename, result[0],
                          int(result[1] * 1000) / 1000, sha, imphash, ssdeep_hash,
                          datetime.now().strftime("%d %B %Y"), ip, matches)
            query = """INSERT INTO MalDet_S
                       (filename, filetype, type_probability,
                        sha_1, imphash, ssdeep, analysis_date, analyzer_ip, possible_matches)
                       VALUES (?,?,?,?,?,?,?,?,?)"""
            c.execute(query, data_tuple)
        if mode != '2':
            os.remove(filepath)
            conn.commit()
            conn.close()
        if mode == '2':
            print('Sandbox here!!!')
            subprocess.call(['python3', 'sandbox.py', filepath])
            try:
                with open('result.zip', 'rb') as f:
                    blobData = f.read()
                query = """INSERT INTO MalDet_B
                           (filename, sha1_hash, result, analysis_date, analyzer_ip)
                           VALUES (?,?,?,?,?)"""
                data_tuple_2 = (filename.split('/')[-1], sha,
                                blobData, datetime.now().strftime("%d %B %Y"), ip)
                c.execute(query, data_tuple_2)
                os.remove(filepath)
                conn.commit()
                conn.close()
            except Exception:
                # sandbox run failed: reset the working directory and carry on
                shutil.rmtree('files')
                os.mkdir('files')
        elif mode == '1':
            info = dict()
            info['name'] = data_tuple[0]
            info['type'] = data_tuple[1]
            info['probability'] = data_tuple[2]  # already rounded to three decimals above
            info['sha1'] = data_tuple[3]
            info['imphash'] = data_tuple[4]
            info['ssdeep'] = data_tuple[5]
            info['date'] = data_tuple[6]
            info['source_ip'] = data_tuple[7]
            info['matches'] = data_tuple[8]
            return info
        elif mode == '0':
            info = dict()
            info['name'] = data_tuple[0]
            info['type'] = data_tuple[1]
            return info
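The size-based routing above is the heart of the function: large samples go to the ember model, small ones to the image classifier. Isolated as a helper for clarity (a sketch only; createRGBImage, predict_image, ember_model, and tf_model are this project's own names, and the 0.5 cutoff mirrors the code above):

import os
import ember

def route_and_score(filepath):
    # mirror the dispatch in analysis(): files over ~1.5 MB are scored by the
    # ember LightGBM model, smaller ones by the image-based CNN classifier
    if os.path.getsize(filepath) > 1500000:
        with open(filepath, "rb") as f:
            prob = ember.predict_sample(ember_model, f.read())
        return ('malware', prob) if prob >= 0.5 else ('benign', 1 - prob)
    picture = createRGBImage(filepath)          # project helper, not shown in the excerpt
    result = predict_image(tf_model, picture)   # project helper, not shown in the excerpt
    os.remove(picture)
    return result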