def start(self):
    """Run every stage of the ML job pipeline in order.

    Logs the job id, config and auxiliary data, then executes the
    train-side stages (read data, feature generation, preprocessing,
    feature selection/extraction, classification, cross validation)
    followed by the test-set stages (generation, preprocessing,
    prediction).

    Returns:
        bool: True if every stage completed, False if any stage raised.
    """
    self.jlogger.info("Starting job with job id {}".format(self.job_id))
    self.jlogger.debug("Job Config: {}".format(self.config))
    self.jlogger.debug("Job Other Data: {}".format(self.job_data))

    try:
        rud.ReadUserData(self)
        fg.FeatureGeneration(self, is_train=True)
        pp.Preprocessing(self, is_train=True)
        fs.FeatureSelection(self, is_train=True)
        fe.FeatureExtraction(self, is_train=True)
        clf.Classification(self)
        cv.CrossValidation(self)
        tsg.TestSetGeneration(self)
        tspp.TestSetPreprocessing(self)
        tsprd.TestSetPrediction(self)
        job_success_status = True
    except Exception:
        # Was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception so interpreter-exit
        # signals still propagate while pipeline errors are handled.
        job_success_status = False
        helper.update_running_job_status(self.job_id, "Errored")
        self.jlogger.exception("Exception occurred in ML Job {} ".format(
            self.job_id))

    return job_success_status
def get_data_splits(self, cv_method):
    """Materialize cross-validation folds with per-fold preprocessing.

    For each split produced by ``cv_method``, builds a fresh MLPipeline
    holding that fold's train/test frames, runs preprocessing, feature
    selection and feature extraction on it (in non-train mode), logs the
    resulting shapes, and collects the transformed fold.

    Returns:
        list[tuple]: one ``(x_train, x_test, y_train, y_test)`` per fold.
    """
    collected_folds = []
    features = self.ml_pipeline.data.values
    labels = self.ml_pipeline.data_labels.values.ravel()

    for fold_no, (train_idx, test_idx) in enumerate(
            cv_method.split(features, labels), start=1):
        # Fresh pipeline instance so each fold is transformed independently.
        fold_pipeline = MLPipeline.MLPipeline(self.ml_pipeline.job_id)
        fold_pipeline.x_train = pd.DataFrame(features[train_idx])
        fold_pipeline.y_train = labels[train_idx]
        fold_pipeline.x_test = pd.DataFrame(features[test_idx])
        fold_pipeline.y_test = labels[test_idx]

        # Apply the same transform chain as training, in inference mode.
        ppp.Preprocessing(fold_pipeline, is_train=False).preprocess_data()
        pfs.FeatureSelection(fold_pipeline,
                             is_train=False).perform_feature_selection()
        pfe.FeatureExtraction(fold_pipeline,
                              is_train=False).perform_feature_extraction()

        self.jlogger.info("Cross validation split number {}".format(fold_no))
        self.jlogger.info("XTrain Shape: {}".format(
            fold_pipeline.x_train.shape))
        self.jlogger.info("XTest Shape: {}".format(
            fold_pipeline.x_test.shape))
        self.jlogger.info("YTrain Shape: {}".format(
            fold_pipeline.y_train.shape))
        self.jlogger.info("YTest Shape: {}".format(
            fold_pipeline.y_test.shape))

        collected_folds.append((fold_pipeline.x_train, fold_pipeline.x_test,
                                fold_pipeline.y_train, fold_pipeline.y_test))

    return collected_folds
def do_preprocessing(pos_path, neg_path, selected_DB, is_bigrams, k=None, method=None, features_space=None):
    """Build vocabularies and a (possibly reduced) feature space for reviews.

    Reads positive/negative reviews, extracts vocabularies, computes term
    frequencies, and reduces the vocabularies to a feature space — either
    the one supplied by the caller or one freshly built via ``k``/``method``.

    Returns:
        When ``features_space`` is supplied (truthy):
            (vocabs, reduced_vocabs, fs) — a 3-tuple.
        Otherwise:
            (vocabs, reduced_vocabs, fs, features_space) — a 4-tuple that
            also hands back the freshly built feature space.
        NOTE(review): the differing tuple arity is preserved as-is —
        presumably callers rely on it; verify before unifying.
    """
    f2r = FileToReview.FileToReview(pos_path, neg_path, selected_DB)
    pos_reviews, neg_reviews = f2r.buildReviewMatrix()

    # Preprocessing needs the review directories, the DB selector, and
    # the already-built review matrices.
    prep = Preprocessing(pos_path, neg_path, selected_DB, pos_reviews,
                         neg_reviews, is_bigrams)
    prep.extract_vocabulary()
    vocabs = prep.get_v()
    nb_neg_review = prep.get_nb_neg_review()
    nb_pos_review = prep.get_nb_pos_review()

    # Term frequencies over the extracted vocabularies.
    tfp = TermFrequencyProcessing.TermFrequencyProcessing(
        pos_path, neg_path, selected_DB)
    tfp.compute_terms_frequency(vocabs)
    T = tfp.get_overall_terms_frequency()

    fs = FeatureSelection.FeatureSelection(T, nb_neg_review, nb_pos_review)

    # Guard clause: a caller-supplied feature space is reused directly.
    if features_space:
        return vocabs, fs.reduce_vocabs(vocabs, features_space), fs

    # No feature space given: build one from k/method and return it too.
    features_space = fs.build_features_space(k, method)
    return vocabs, fs.reduce_vocabs(vocabs, features_space), fs, features_space
"""
This script will read all the emails and it will train the classifier
"""
# NOTE: the original file opened with a stray ''' before this docstring,
# which left an unterminated triple-quoted string (a SyntaxError); it has
# been removed.
import os

from Email import *
from FeatureSelection import *
from NaiveBayesClassifier import *

trainPath = "dataset"
trainSet_emails = []

# Create an Email object for every file we read from the training directory.
for f in os.listdir(trainPath):
    fileName = trainPath + '/' + f
    e = Email()
    # Files whose name contains "spm" are labelled spam, all others ham.
    if "spm" in fileName:
        e.setCategory("SPAM")
    else:
        e.setCategory("HAM")
    e.read(fileName)
    # Insert the email we created into the collection of training emails.
    trainSet_emails.append(e)

# Select features from our training set (automatic feature selection).
fs = FeatureSelection(trainSet_emails)
fs.selectFeatures()

# Create a naive Bayes classifier and train it on the collected emails.
nb = NaiveBayesClassifier()
nb.setEmails(trainSet_emails)
nb.train()
# Driver script: runs each analysis stage of the project in sequence.
import sys  # NOTE(review): imported but never used in this chunk — confirm before removing
import LiveBodyAnalysis as LBA
import FeatureExploration as FE
import SegmentAnalysis as SA  # NOTE(review): imported but never invoked below — possibly a missing SA.SegmentAnalysis() call; verify intent
import FeatureSelection as FS
import ModelAccuracy as MA

# Execute the pipeline stages in order: body analysis, feature
# exploration, feature selection, then model accuracy evaluation.
LBA.LiveBodyAnalysis()
FE.FeatureExploration()
FS.FeatureSelection()
MA.ModelAccuracy()