Exemplo n.º 1
0
    def start(self):
        """Run the full ML pipeline for this job.

        Executes every pipeline stage in order and returns True on
        success. On any failure the running-job status is set to
        "Errored", the exception is logged with traceback, and False
        is returned.
        """
        # perform some logging
        self.jlogger.info("Starting job with job id {}".format(self.job_id))
        self.jlogger.debug("Job Config: {}".format(self.config))
        self.jlogger.debug("Job Other Data: {}".format(self.job_data))

        try:
            # Training-side stages.
            rud.ReadUserData(self)
            fg.FeatureGeneration(self, is_train=True)
            pp.Preprocessing(self, is_train=True)
            fs.FeatureSelection(self, is_train=True)
            fe.FeatureExtraction(self, is_train=True)
            clf.Classification(self)
            cv.CrossValidation(self)
            # Test-set stages.
            tsg.TestSetGeneration(self)
            tspp.TestSetPreprocessing(self)
            tsprd.TestSetPrediction(self)
            job_success_status = True
        except Exception:
            # Narrowed from a bare `except:` so SystemExit and
            # KeyboardInterrupt still propagate; the actual exception
            # (with traceback) is recorded via jlogger.exception below.
            job_success_status = False
            helper.update_running_job_status(self.job_id, "Errored")
            self.jlogger.exception("Exception occurred in ML Job {} ".format(
                self.job_id))

        return job_success_status
    def get_data_splits(self, cv_method):
        """Produce fully-preprocessed train/test data for every CV fold.

        Each split yielded by ``cv_method`` is wrapped in a fresh
        MLPipeline and pushed through preprocessing, feature selection
        and feature extraction in inference mode; the resulting
        (x_train, x_test, y_train, y_test) tuples are collected and
        returned.
        """
        cv_data_splits = []

        features = self.ml_pipeline.data.values
        labels = self.ml_pipeline.data_labels.values.ravel()

        for split_no, (train_idx, test_idx) in enumerate(
                cv_method.split(features, labels), start=1):
            # Fresh pipeline instance holding just this fold's data.
            fold_pipeline = MLPipeline.MLPipeline(self.ml_pipeline.job_id)
            fold_pipeline.x_train = pd.DataFrame(features[train_idx])
            fold_pipeline.y_train = labels[train_idx]
            fold_pipeline.x_test = pd.DataFrame(features[test_idx])
            fold_pipeline.y_test = labels[test_idx]

            # Apply the inference-mode transformation stages to the fold.
            ppp.Preprocessing(fold_pipeline, is_train=False).preprocess_data()
            pfs.FeatureSelection(
                fold_pipeline, is_train=False).perform_feature_selection()
            pfe.FeatureExtraction(
                fold_pipeline, is_train=False).perform_feature_extraction()

            self.jlogger.info(
                "Cross validation split number {}".format(split_no))
            self.jlogger.info("XTrain Shape: {}".format(
                fold_pipeline.x_train.shape))
            self.jlogger.info("XTest Shape: {}".format(
                fold_pipeline.x_test.shape))
            self.jlogger.info("YTrain Shape: {}".format(
                fold_pipeline.y_train.shape))
            self.jlogger.info("YTest Shape: {}".format(
                fold_pipeline.y_test.shape))

            cv_data_splits.append(
                (fold_pipeline.x_train, fold_pipeline.x_test,
                 fold_pipeline.y_train, fold_pipeline.y_test))

        return cv_data_splits
Exemplo n.º 3
0
def do_preprocessing(pos_path,
                     neg_path,
                     selected_DB,
                     is_bigrams,
                     k=None,
                     method=None,
                     features_space=None):
    """Build vocabularies, term frequencies and a reduced feature space.

    Reads positive/negative reviews, extracts vocabularies, computes
    overall term frequencies, and reduces the vocabularies against a
    feature space. If ``features_space`` is falsy it is built from
    ``k`` and ``method`` and returned as a fourth element; otherwise
    the given space is reused and three values are returned.
    """
    # Load the raw positive and negative review matrices from disk.
    review_reader = FileToReview.FileToReview(pos_path, neg_path, selected_DB)
    pos_reviews, neg_reviews = review_reader.buildReviewMatrix()

    # Preprocessing needs the review directories, database number and
    # the review matrices; it extracts per-polarity vocabularies.
    prep = Preprocessing(pos_path, neg_path, selected_DB, pos_reviews,
                         neg_reviews, is_bigrams)
    prep.extract_vocabulary()
    vocabs = prep.get_v()
    nb_neg_review = prep.get_nb_neg_review()
    nb_pos_review = prep.get_nb_pos_review()

    # Overall term frequencies across the corpus.
    tfp = TermFrequencyProcessing.TermFrequencyProcessing(
        pos_path, neg_path, selected_DB)
    tfp.compute_terms_frequency(vocabs)
    T = tfp.get_overall_terms_frequency()

    fs = FeatureSelection.FeatureSelection(T, nb_neg_review, nb_pos_review)

    if features_space:
        # Caller supplied a feature space — reuse it.
        return vocabs, fs.reduce_vocabs(vocabs, features_space), fs

    # No feature space given: build one and hand it back as well.
    features_space = fs.build_features_space(k, method)
    reduced_vocabs = fs.reduce_vocabs(vocabs, features_space)
    return vocabs, reduced_vocabs, fs, features_space
'''
""" This script will read all the emails and it will train the classifier """

import os
from Email import *
from FeatureSelection import *
from NaiveBayesClassifier import *

# Build the labelled training corpus: one Email object per file on disk.
trainPath = "dataset"
trainSet_emails = []

for entry in os.listdir(trainPath):
    path = trainPath + '/' + entry
    email = Email()
    # Files whose name contains "spm" are the spam examples; the rest is ham.
    email.setCategory("SPAM" if "spm" in path else "HAM")
    email.read(path)
    # Collect the email into the training set.
    trainSet_emails.append(email)

# Automatic feature selection over the whole training set.
fs = FeatureSelection(trainSet_emails)
fs.selectFeatures()

# Train a naive Bayes classifier on the labelled emails.
nb = NaiveBayesClassifier()
nb.setEmails(trainSet_emails)
nb.train()
Exemplo n.º 5
0
import sys
import LiveBodyAnalysis as LBA
import FeatureExploration as FE
import SegmentAnalysis as SA
import FeatureSelection as FS
import ModelAccuracy as MA

# Run the analysis stages in sequence.
LBA.LiveBodyAnalysis()
FE.FeatureExploration()
# NOTE(review): SegmentAnalysis is imported as SA above but never invoked —
# confirm whether SA.SegmentAnalysis() was meant to run here.
FS.FeatureSelection()
MA.ModelAccuracy()