예제 #1
0
def predict_sample(lgbm_model, file_data, featurelist):
    """
    Score the raw bytes of a single PE file with a LightGBM model.

    A fresh PEFeatureExtractor is built from ``featurelist``, the bytes in
    ``file_data`` are turned into a float32 feature vector, and that vector
    is handed to ``lgbm_model.predict`` as a one-element batch.

    Returns the first (and only) entry of the prediction result.
    """
    vector = PEFeatureExtractor(featurelist).feature_vector(file_data)
    batch = [np.array(vector, dtype=np.float32)]
    predictions = lgbm_model.predict(batch)
    return predictions[0]
예제 #2
0
def predict_gbm(file_path):
    """
    Score the file at ``file_path`` with the LightGBM model stored on disk.

    The file is read as bytes, vectorised by a PEFeatureExtractor created
    with ``feature_version=2``, and scored by a booster loaded from
    ``results/models/ember_model_2018.txt`` (relative to the working
    directory). The booster is loaded on every call.

    Returns the first entry of the booster's prediction for the file.
    """
    extractor = PEFeatureExtractor(feature_version=2)
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    vector = np.array(extractor.feature_vector(raw_bytes), dtype=np.float32)

    booster = lightgbm.Booster(
        model_file=os.path.join('results', 'models', 'ember_model_2018.txt')
    )
    return booster.predict([vector])[0]
예제 #3
0
def vectorize(irow, raw_features_string, X_path, y_path, nrows, features, dim):
    """
    Vectorize one JSON-encoded sample and store it at row ``irow``.

    ``raw_features_string`` is parsed as JSON and converted to a feature
    vector by ``PEFeatureExtractor(features, dim)``. The sample's "label"
    entry is written to position ``irow`` of the float32 memmap at
    ``y_path`` (shape ``nrows``), and the feature vector to row ``irow`` of
    the float32 memmap at ``X_path`` (shape ``(nrows, dim)``). Both memmaps
    are opened in "r+" mode.

    Returns None.
    """
    extractor = PEFeatureExtractor(features, dim)
    sample = json.loads(raw_features_string)
    row_vector = extractor.process_raw_features(sample)

    # Label first, then the feature row.
    labels = np.memmap(y_path, dtype=np.float32, mode="r+", shape=nrows)
    labels[irow] = sample["label"]

    matrix = np.memmap(X_path, dtype=np.float32, mode="r+", shape=(nrows, dim))
    matrix[irow] = row_vector
예제 #4
0
    def extract_features(self, sample):
        """
        Extract the raw features of one sample and attach its hash and label.

        ``sample`` is a file name inside ``self.datadir``; it is also stored
        under the "sha256" key of the result. The "label" entry is taken
        from ``self.data``: element 1 of the first row whose ``hash`` equals
        ``sample`` (presumably a pandas DataFrame of hash/label pairs --
        confirm against the caller).

        Returns the feature dict, or None if any error occurred while
        reading, extracting or labelling. A KeyboardInterrupt terminates the
        process via ``sys.exit()``.
        """
        extractor = PEFeatureExtractor(self.features)
        fullpath = os.path.join(self.datadir, sample)
        try:
            # Context manager so the handle is closed even when read() fails.
            with open(fullpath, 'rb') as f:
                binary = f.read()
            feature = extractor.raw_features(binary)
            feature.update({"sha256": sample})  # sample name (hash)
            feature.update({"label": self.data[self.data.hash == sample].values[0][1]})  # label
        except KeyboardInterrupt:
            sys.exit()
        except Exception as e:
            print('errror exception')
            # Report which sample failed and why instead of dropping the cause.
            print('{}: {}'.format(sample, e))
            return None

        return feature
예제 #5
0
 def extract_data(self, file_data, featurelist):
     """
     Turn raw file bytes into a float32 feature vector.

     The extractor is created on first use from ``featurelist`` and cached
     on ``self.extractor``; later calls reuse it and ignore ``featurelist``.

     Returns a numpy float32 array built from
     ``self.extractor.feature_vector(file_data)``.
     """
     # Identity check: an extractor that overrides __eq__ must not be
     # mistaken for "not yet created".
     if self.extractor is None:
         self.extractor = PEFeatureExtractor(featurelist)
     return np.array(self.extractor.feature_vector(file_data), dtype=np.float32)
예제 #6
0
class Predictor:
    """
    Score every file in a directory with a loaded model and write a CSV.

    Models are registered in ``self.modellist`` under the keys "LGB", "XGB"
    and "RF" by the ``*_load`` methods. ``run`` scores each file with the
    "LGB" model only and writes an ``ID``/``Class`` table to ``output``.
    """

    # Scores strictly above this value are written as class 1, others as 0.
    THRESHOLD = 0.7

    def __init__(self, testdir, features, output):
        """
        Store the configuration; no model is loaded here.

        testdir:  directory whose files are scored by ``run``.
        features: feature list passed to PEFeatureExtractor on first use.
        output:   destination passed to ``DataFrame.to_csv``.
        """
        self.testdir = testdir
        self.output = output
        self.features = features
        self.modellist = dict()   # model name -> loaded model
        self.extractor = None     # created lazily by extract_data
        self.err = 0              # number of failed predictions

    def lgbmodel_load(self, modelpath):
        """Load a LightGBM booster from ``modelpath`` and register it as "LGB"."""
        # Opening the file first surfaces a missing/unreadable path as an
        # OSError before LightGBM is invoked; the handle itself is not used.
        with open(modelpath, 'rb'):
            pass
        self.modellist["LGB"] = lgb.Booster(model_file=modelpath)

    def xgbmodel_load(self, modelpath):
        """Load a joblib-serialised model from ``modelpath`` and register it as "XGB"."""
        # NOTE: joblib.load unpickles the file; only load trusted model files.
        self.modellist["XGB"] = joblib.load(modelpath)

    def rfmodel_load(self, modelpath):
        """Load a joblib-serialised model from ``modelpath`` and register it as "RF"."""
        # NOTE: joblib.load unpickles the file; only load trusted model files.
        self.modellist["RF"] = joblib.load(modelpath)

    def extract_data(self, file_data, featurelist):
        """
        Turn raw file bytes into a float32 feature vector.

        The extractor is created on first use from ``featurelist`` and then
        reused; later calls ignore ``featurelist``.
        """
        if self.extractor is None:
            self.extractor = PEFeatureExtractor(featurelist)
        return np.array(self.extractor.feature_vector(file_data), dtype=np.float32)

    def predict_sample(self, modelname, features, y_list):
        """
        Append the prediction of model ``modelname`` for ``features`` to ``y_list``.

        ``features`` is reshaped to a single-row batch before prediction. On
        any error (including an unregistered model name) the error is
        printed, 0 is appended instead and ``self.err`` is incremented, so
        ``y_list`` always grows by exactly one entry. A KeyboardInterrupt
        terminates the process via ``sys.exit()``.
        """
        try:
            y_list.append(self.modellist[modelname].predict(features.reshape(1, -1))[0])
        except KeyboardInterrupt:
            sys.exit()
        except Exception as e:
            print(modelname + ' error')
            print(e)
            y_list.append(0)
            self.err += 1

    def run(self):
        """
        Score every regular file in ``self.testdir`` and write the result CSV.

        Each file is read, vectorised and scored with the "LGB" model. Scores
        above ``THRESHOLD`` become class 1, all others class 0. The CSV at
        ``self.output`` has the columns ``ID`` (from ``format_spliter``) and
        ``Class``. The number of failed predictions is printed at the end.
        """
        lgby = []
        name = []
        # Number of files directly inside testdir; used only for the progress bar.
        end = len(next(os.walk(self.testdir))[2])

        for sample in tqdm.tqdm(utility.directory_generator(self.testdir), total=end):
            fullpath = os.path.join(self.testdir, sample)
            if not os.path.isfile(fullpath):
                continue

            # Context manager so each handle is closed before the next file.
            with open(fullpath, "rb") as f:
                binary = f.read()
            name.append(format_spliter(sample))
            features = self.extract_data(binary, self.features)
            # Only the LightGBM model is scored here; "XGB" and "RF" are not used.
            self.predict_sample("LGB", features, lgby)

        lgby = np.where(np.array(lgby) > self.THRESHOLD, 1, 0)

        series = OrderedDict([
            ('ID', name),
            ('Class', lgby),
        ])
        r = pd.DataFrame.from_dict(series)
        r.to_csv(self.output, index=False)

        print('{} error is occured'.format(self.err))