def extract_features(pdfs_ben, pdfs_mal, csv_name):
    '''
    Extracts feature vectors from the given benign and malicious PDF
    files in parallel and writes the result to a CSV file.

    pdfs_mal are labeled 1.0 (positive class) and pdfs_ben 0.0. Files
    for which feature extraction fails (feats is None) are skipped.

    pdfs_ben -- iterable of benign PDF file names
    pdfs_mal -- iterable of malicious PDF file names
    csv_name -- path of the CSV file to write
    '''
    feat_vecs = []
    labels = []
    file_names = []
    # Extract malicious and benign features in a worker pool; the two
    # original copy-pasted loops differed only in the label value
    pool = multiprocessing.Pool()
    try:
        for label, pdfs in ((1.0, pdfs_mal), (0.0, pdfs_ben)):
            for pdf, feats in pool.imap(get_features, pdfs):
                if feats is not None:
                    feat_vecs.append(feats)
                    labels.append(label)
                    file_names.append(pdf)
    finally:
        # Shut the pool down explicitly instead of leaking worker
        # processes until interpreter exit
        pool.close()
        pool.join()
    # Convert the data points into a numpy array. numpy.zeros already
    # takes dtype/order, so the original numpy.array(numpy.zeros(...))
    # made a redundant extra copy.
    X = numpy.zeros((len(feat_vecs),
                     featureedit.FeatureDescriptor.get_feature_count()),
                    dtype=numpy.float64, order='C')
    for i, v in enumerate(feat_vecs):
        X[i, :] = v
    # Write the resulting CSV file
    datasets.numpy2csv(csv_name, X, labels, file_names)
def extract_features(pdfs_ben, pdfs_mal, csv_name):
    '''
    Extracts features from malicious and benign PDFs in parallel and
    saves them, with labels and file names, as a CSV dataset.

    Malicious samples get label 1.0, benign samples 0.0; any PDF whose
    feature extraction returns None is dropped.

    pdfs_ben -- iterable of benign PDF file names
    pdfs_mal -- iterable of malicious PDF file names
    csv_name -- path of the CSV file to write
    '''
    feat_vecs = []
    labels = []
    file_names = []
    # Run feature extraction across a process pool; collapse the two
    # near-identical loops into one parameterized by the label
    pool = multiprocessing.Pool()
    try:
        for label, pdf_list in ((1.0, pdfs_mal), (0.0, pdfs_ben)):
            for pdf, feats in pool.imap(get_features, pdf_list):
                if feats is not None:
                    feat_vecs.append(feats)
                    labels.append(label)
                    file_names.append(pdf)
    finally:
        # Terminate the worker processes deterministically; the
        # original never closed the pool
        pool.close()
        pool.join()
    # Assemble the feature matrix directly with numpy.zeros (the extra
    # numpy.array() wrapper in the original copied the buffer twice)
    X = numpy.zeros((len(feat_vecs),
                     featureedit.FeatureDescriptor.get_feature_count()),
                    dtype=numpy.float64, order='C')
    for i, v in enumerate(feat_vecs):
        X[i, :] = v
    # Write the resulting CSV file
    datasets.numpy2csv(csv_name, X, labels, file_names)
def fit(self, X, y):
    '''
    Trains a new random forest classifier.

    X -- 2D numpy array of feature vectors, one row per sample
    y -- labels corresponding to the rows of X

    The training data is round-tripped through a temporary CSV file so
    it can be loaded into the embedded R session; the fitted R
    randomForest model is stored under the global R name
    self.model_Rname.

    NOTE(review): tempfile.NamedTemporaryFile is reopened by name from
    R while still open here — this works on POSIX but not on Windows.
    '''
    # Serialize all access to the embedded R interpreter, which is a
    # shared global resource
    with _R_lock:
        with tempfile.NamedTemporaryFile() as tmpfile:
            datasets.numpy2csv(tmpfile, X, y)
            tmpfile.seek(0)
            # Read in the CSV file with the training samples, omitting the second column (filename)
            robjects.r('{train} <- read.csv("{csv}", header=TRUE, colClasses={cc})'.format(train=self.traindata_Rname, csv=tmpfile.name, cc=_r_colClasses))
            # Train a random forest of 1000 decision trees with mtry=43
            # variables sampled at each split (the original comment said
            # 33, which contradicted the code)
            robjects.r('{model} <- randomForest(x={train}[,-1], y={train}[,1], ntree=1000, mtry=43, importance=TRUE)'.format(model=self.model_Rname, train=self.traindata_Rname))
            # Mark the model as ready so decision_function's assertion passes
            self.model_trained = True
def decision_function(self, X):
    '''
    Classifies novel data points using a trained model.

    X -- 2D numpy array of feature vectors, one row per sample

    Returns a numpy array of shape (X.shape[0], 1) giving, per data
    point, the probability of belonging to the positive class.
    '''
    assert self.model_trained, 'Must train a model before classification'
    # Serialize all access to the embedded R interpreter
    with _R_lock:
        with tempfile.NamedTemporaryFile() as tmpfile:
            # Labels are irrelevant for prediction; write zeros as a
            # placeholder column
            datasets.numpy2csv(tmpfile, X, numpy.zeros((X.shape[0],)))
            tmpfile.seek(0)
            # Read in the CSV file with the samples to be classified,
            # omitting the second column (filename)
            robjects.r('{novel} <- read.csv("{csv}", header=TRUE, colClasses={cc})'.format(
                novel=self.noveldata_Rname, csv=tmpfile.name, cc=_r_colClasses))
            # Classify the new data points
            robjects.r('{pred} <- predict({model}, {novel}, type="prob")'.format(
                pred=self.predictions_Rname, model=self.model_Rname,
                novel=self.noveldata_Rname))
            # '{pred}'.format(...) was a no-op wrapper around the name
            predictions = list(robjects.r[self.predictions_Rname])
            # The first half of predictions is for the negative class,
            # so keep only the second half.
            # BUG FIX: use floor division — '/' yields a float under
            # Python 3, and a float slice index raises TypeError.
            predictions = predictions[len(predictions) // 2:]
            res = numpy.zeros((X.shape[0], 1))
            for i, p in enumerate(predictions):
                res[i] = p
            return res