def predict_sample(lgbm_model, file_data, featurelist):
    """ Predict a PE file with a LightGBM model """
    extractor = PEFeatureExtractor(featurelist)
    features = np.array(extractor.feature_vector(file_data), dtype=np.float32)
    return lgbm_model.predict([features])[0]
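# Usage sketch for predict_sample above. The model path, sample path, and feature
# list are assumptions supplied by the caller; nothing here comes from the original code.
def demo_predict_sample(model_path, sample_path, featurelist):
    import lightgbm as lgb
    model = lgb.Booster(model_file=model_path)           # pretrained LightGBM booster (assumed path)
    with open(sample_path, 'rb') as f:
        file_data = f.read()                             # raw PE bytes
    return predict_sample(model, file_data, featurelist)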
def predict_gbm(file_path):
    extractor = PEFeatureExtractor(feature_version=2)
    with open(file_path, 'rb') as f:
        file_data = f.read()
    features = np.array(extractor.feature_vector(file_data), dtype=np.float32)
    model_path = os.path.join('results', 'models', 'ember_model_2018.txt')
    model = lightgbm.Booster(model_file=model_path)
    score = model.predict([features])[0]
    return score
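# Usage sketch for predict_gbm above. The sample path and decision threshold are
# assumptions; the 0.7 cutoff mirrors the threshold used by the Predictor class
# further down in this file.
def demo_predict_gbm(sample_path, threshold=0.7):
    score = predict_gbm(sample_path)
    return {'path': sample_path, 'score': float(score), 'malicious': score > threshold}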
def vectorize(irow, raw_features_string, X_path, y_path, nrows, features, dim):
    """ Vectorize a single sample of raw features and write to a large numpy file """
    extractor = PEFeatureExtractor(features, dim)
    raw_features = json.loads(raw_features_string)
    feature_vector = extractor.process_raw_features(raw_features)

    y = np.memmap(y_path, dtype=np.float32, mode="r+", shape=nrows)
    y[irow] = raw_features["label"]

    X = np.memmap(X_path, dtype=np.float32, mode="r+", shape=(nrows, dim))
    X[irow] = feature_vector
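# Sketch of how vectorize could be driven (assumed, not from the original code):
# vectorize opens the X/y memmaps with mode="r+", so the files must be pre-allocated
# with the right shape before any rows can be written.
def demo_vectorize(raw_feature_lines, X_path, y_path, features, dim):
    nrows = len(raw_feature_lines)
    # Pre-allocate the on-disk arrays that vectorize writes into.
    np.memmap(X_path, dtype=np.float32, mode="w+", shape=(nrows, dim))
    np.memmap(y_path, dtype=np.float32, mode="w+", shape=nrows)
    for irow, raw_features_string in enumerate(raw_feature_lines):
        vectorize(irow, raw_features_string, X_path, y_path, nrows, features, dim)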
def extract_features(self, sample):
    """ Extract raw features from a sample. Returns None if an error occurs. """
    extractor = PEFeatureExtractor(self.features)
    fullpath = os.path.join(self.datadir, sample)
    try:
        with open(fullpath, 'rb') as f:
            binary = f.read()
        feature = extractor.raw_features(binary)
        feature.update({"sha256": sample})  # sample name (hash)
        feature.update({"label": self.data[self.data.hash == sample].values[0][1]})  # label
    except KeyboardInterrupt:
        sys.exit()
    except Exception as e:
        print('error extracting {}: {}'.format(sample, e))
        return None
    return feature
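# Sketch of how extract_features might be consumed (assumed; `dataset` stands for an
# instance of the class this method belongs to, and the output path is illustrative).
# Raw-feature dicts are commonly stored one JSON object per line.
def demo_dump_raw_features(dataset, samples, out_path='raw_features.jsonl'):
    with open(out_path, 'w') as out:
        for sample in samples:
            feature = dataset.extract_features(sample)
            if feature is not None:                      # skip samples that failed to parse
                out.write(json.dumps(feature) + '\n')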
def extract_data(self, file_data, featurelist):
    if self.extractor is None:
        self.extractor = PEFeatureExtractor(featurelist)
    features = np.array(self.extractor.feature_vector(file_data), dtype=np.float32)
    return features
class Predictor:
    def __init__(self, testdir, features, output):
        # models are loaded later via lgbmodel_load / xgbmodel_load / rfmodel_load
        self.testdir = testdir
        self.output = output
        self.features = features
        self.modellist = dict()
        self.extractor = None
        self.err = 0

    def lgbmodel_load(self, modelpath):
        with open(modelpath, 'rb') as f:  # open first so a missing/unreadable model file raises a clear error
            self.modellist["LGB"] = lgb.Booster(model_file=modelpath)

    def xgbmodel_load(self, modelpath):
        self.modellist["XGB"] = joblib.load(modelpath)

    def rfmodel_load(self, modelpath):
        self.modellist["RF"] = joblib.load(modelpath)

    def extract_data(self, file_data, featurelist):
        if self.extractor is None:
            self.extractor = PEFeatureExtractor(featurelist)
        features = np.array(self.extractor.feature_vector(file_data), dtype=np.float32)
        return features

    def predict_sample(self, modelname, features, y_list):
        try:
            y_list.append(self.modellist[modelname].predict(features.reshape(1, -1))[0])
        except KeyboardInterrupt:
            sys.exit()
        except Exception as e:
            print(modelname + ' error')
            print(e)
            y_list.append(0)
            self.err += 1

    def run(self):
        lgby = []
        name = []
        end = len(next(os.walk(self.testdir))[2])  # number of files in the test directory
        for sample in tqdm.tqdm(utility.directory_generator(self.testdir), total=end):
            fullpath = os.path.join(self.testdir, sample)
            if os.path.isfile(fullpath):
                with open(fullpath, "rb") as f:
                    binary = f.read()
                name.append(format_spliter(sample))
                features = self.extract_data(binary, self.features)
                self.predict_sample("LGB", features, lgby)
                #self.predict_sample("XGB", features, xgby)
                #self.predict_sample("RF", features, rfy)
        lgby = np.where(np.array(lgby) > 0.7, 1, 0)  # threshold LGB scores; the other models already return class labels
        series = OrderedDict([
            ('ID', name),
            ('Class', lgby),
        ])
        r = pd.DataFrame.from_dict(series)
        r.to_csv(self.output, index=False)
        print('{} errors occurred'.format(self.err))
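# Usage sketch for the Predictor class above. The test directory, feature list,
# model path, and output filename are assumptions; only the LightGBM model is
# loaded because run() only calls predict_sample("LGB", ...).
def demo_run_predictor(testdir, featurelist, lgb_model_path, output_csv='predictions.csv'):
    predictor = Predictor(testdir, featurelist, output_csv)
    predictor.lgbmodel_load(lgb_model_path)
    predictor.run()                                      # writes an ID/Class CSV to output_csv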