예제 #1
0
def predict():
    """
    Render the e-mail submission form (GET) or classify submitted e-mails (POST).

    GET: render 'emailsubmit.html' with template variable 'form' set to an
    'InputForm' whose 'inputmodel' choices are the model names found in the
    configured 'ML_MODEL_UPLOAD_FOLDER' (file names without extension, word
    feature files excluded).

    POST: each failed check flashes a message and redirects to the same URL.

    1. Neither text-area input nor an uploaded file:
       'No Input: Provide a Single or Multiple Emails as Input.'
    2. Both text-area input and an uploaded file:
       'Two Inputs Provided: Provide Only One Input.'
    3. An uploaded '.txt' file is saved into 'INPUT_DATA_UPLOAD_FOLDER' and
       its contents become 'input_txt'; text-area input is used directly.
    4. 'input_txt' is validated with 'validate_input_text'; a falsy result
       (or an upload that is not a '.txt' file) gives:
       'Unexpected Format : Input Text is not in Specified Format.'
    5. No model chosen: 'Please Choose a single Model'
    6. Predictions 0 / 1 are converted to 'NOT SPAM' / 'SPAM', written to
       'predictions.json' in 'INPUT_DATA_UPLOAD_FOLDER', and rendered with
       'displayresults.html'.
    """
    if request.method == "GET":
        model_files = os.listdir(current_app.config['ML_MODEL_UPLOAD_FOLDER'])
        # os.path.splitext removes only the extension. str.strip('.pk') strips
        # any leading/trailing '.', 'p' or 'k' characters and mangles names
        # such as 'pk_model.pk'.
        model_names = [
            os.path.splitext(model)[0] for model in model_files
            if 'word_features' not in model
        ]

        form = InputForm()
        form.inputmodel.choices = [(name, name) for name in model_names]

        return render_template('emailsubmit.html', form=form)

    elif request.method == "POST":

        # Missing form fields are treated the same as empty ones.
        inputemail = request.form.get('inputemail') or ''
        inputfile = request.files.get('inputfile')
        inputmodel = request.form.get('inputmodel')
        filename = (inputfile.filename or '') if inputfile is not None else ''

        if not inputemail and not filename:
            flash('No Input: Provide a Single or Multiple Emails as Input.')
            return redirect(request.url)
        if inputemail and filename:
            flash('Two Inputs Provided: Provide Only One Input.')
            return redirect(request.url)

        if inputemail:
            input_txt = inputemail
        elif allowed_file(filename, ['txt']):
            txt_file_withpath = os.path.join(
                current_app.config['INPUT_DATA_UPLOAD_FOLDER'],
                secure_filename(filename))
            inputfile.save(txt_file_withpath)

            with open(txt_file_withpath, "r") as f:
                input_txt = f.read()
        else:
            # Uploaded file is not a '.txt' file: there is no text to validate.
            flash('Unexpected Format : Input Text is not in Specified Format.')
            return redirect(request.url)

        # Validate once and reuse the parsed result for prediction.
        validated = validate_input_text(input_txt)
        if not validated:
            flash('Unexpected Format : Input Text is not in Specified Format.')
            return redirect(request.url)
        if inputmodel is None:
            flash('Please Choose a single Model')
            return redirect(request.url)

        classifier = spamclassifier.SpamClassifier()
        classifier.load_model(inputmodel)
        result = classifier.predict(validated)

        if isinstance(result, dict):
            labelled = result.items()
        else:
            # NOTE(review): a sequence result is paired with the validated
            # inputs, assuming predictions come back in input order - confirm
            # against spamclassifier.predict.
            labelled = zip(validated, result)

        pred = OrderedDict(
            (key, "NOT SPAM" if value == 0 else "SPAM" if value == 1 else value)
            for key, value in labelled)

        predictions_path = os.path.join(
            current_app.config['INPUT_DATA_UPLOAD_FOLDER'], 'predictions.json')
        with open(predictions_path, 'w') as fp:
            json.dump(pred, fp)

        return render_template('displayresults.html',
                               predictions=pred.items())
예제 #2
0
def predict():
    '''
    Render the e-mail submission form (GET) or classify submitted e-mails (POST).

    GET: render 'emailsubmit.html' with 'form' set to an 'InputForm' instance
    and 'inputmodel' set to the file names found in 'ML_MODEL_UPLOAD_FOLDER'.

    POST: each failed check flashes a message and redirects to the same URL.

    1. Neither text-area input nor an uploaded file:
       'No Input: Provide a Single or Multiple Emails as Input.'
    2. Both text-area input and an uploaded file:
       'Two Inputs Provided: Provide Only One Input.'
    3. An uploaded file is saved into 'UPLOAD_FOLDER' and its contents are
       read; text-area input is used directly.
    4. The text is validated with 'validate_input_text'; a falsy result gives:
       'Unexpected Format : Input Text is not in Specified Format.'
    5. No model chosen: 'Please Choose a single Model'
    6. Predictions equal to 0 become 'NOT SPAM', everything else 'SPAM'; the
       result is written to 'predictions.json' and 'display_results()' is
       returned.

    Any exception raised while handling the POST is flashed to the user and
    followed by a redirect to the same URL.
    '''
    if request.method == "GET":
        files = os.listdir(app.config['ML_MODEL_UPLOAD_FOLDER'])
        form = InputForm()
        # The 'inputmodel' choices are defined in forms; the template receives
        # the raw file listing separately.
        return render_template("emailsubmit.html", form=form, inputmodel=files)
    elif request.method == "POST":
        try:
            email_file = request.files["inputfile"]
            email_text = request.form.get("inputemail")

            if not email_file and not email_text:
                flash(
                    "No Input: Provide a Single or Multiple Emails as Input.")
                return redirect(request.url)
            if email_file and email_text:
                flash("Two Inputs Provided: Provide Only One Input.")
                return redirect(request.url)

            if email_text:
                input_text = email_text
            else:
                filename = secure_filename(email_file.filename)
                # Read back from exactly the path the upload was saved to.
                # Concatenating UPLOAD_FOLDER + filename only works when the
                # configured folder ends with a path separator.
                saved_path = os.path.join(app.config['UPLOAD_FOLDER'],
                                          filename)
                email_file.save(saved_path)
                with open(saved_path) as upload_fp:
                    input_text = upload_fp.read()

            # Both input sources share one validate / predict / save pipeline.
            text = validate_input_text(input_text)
            if not text:
                flash(
                    'Unexpected Format : Input Text is not in Specified Format.'
                )
                return redirect(request.url)

            model_name = request.form.get("inputmodel")
            if not model_name:
                flash('Please Choose a single Model')
                return redirect(request.url)

            classifier = spamclassifier.SpamClassifier()
            classifier.load_model(model_name)
            result = classifier.predict(text)

            pred = OrderedDict(
                (email, "NOT SPAM" if prediction == 0 else "SPAM")
                for email, prediction in result.items())
            with open("predictions.json", "w") as predictions_fp:
                predictions_fp.write(json.dumps(pred))
            return display_results()

        except Exception as e:
            # Top-level boundary of the view: report the failure to the user
            # instead of returning a 500 page.
            flash(
                "Something else entirely went wrong in Predict, Heres the error - \n Check in debugger "
                + str(e))
            return redirect(request.url)
예제 #3
0
def train_dataset():
    """
    List trainable CSV files (GET) or train and persist a spam model (POST).

    GET: render 'train.html' with 'train_files' set to the entries of
    'INPUT_DATA_UPLOAD_FOLDER' whose names contain '.csv'.

    POST: read 'train_file', 'train_size', 'random_state', 'shuffle' and
    'stratify' from the form. The first failing check below flashes its
    message and redirects to the same URL:

    - no 'train_file':              'No CSV file is selected'
    - missing/empty 'train_size':   'No value provided for size of training data set.'
    - 'train_size' not a float:     'Training Data Set Size must be a float.'
    - 'train_size' outside (0, 1):  'Training Data Set Size Value must be in between 0.0 and 1.0'
    - missing/empty 'random_state': 'No value provided for random state.'
    - 'random_state' not an int:    'Random State must be an integer.'
    - no 'shuffle':                 'No option for shuffle is selected.'
    - shuffle 'N' with stratify 'Y': 'When Shuffle is No, Startify cannot be Yes.'

    When every value is valid, the CSV is split with 'train_test_split', a
    'SpamClassifier' is trained on the training part, and the model plus its
    word features are pickled into 'ML_MODEL_UPLOAD_FOLDER' as '<name>.pk' and
    '<name>_word_features.pk' (for 'sample.csv': 'sample.pk' and
    'sample_word_features.pk'). The view then returns
    'display_models(<name>.pk)'.
    """

    def _reject(message):
        # Flash the validation error and send the user back to the form.
        flash(message)
        return redirect(request.url)

    if request.method == "GET":
        input_dir = current_app.config['INPUT_DATA_UPLOAD_FOLDER']
        csv_names = []
        for entry in os.listdir(input_dir):
            if '.csv' in entry:
                csv_names.append(entry)
        return render_template('train.html', train_files=csv_names)

    elif request.method == "POST":
        form = request.form
        train_file = form.get('train_file')
        train_size = form.get('train_size')
        random_state = form.get('random_state')
        shuffle = form.get('shuffle')
        stratify = form.get('stratify')

        # Validation runs top to bottom; the first failure wins.
        if train_file is None:
            return _reject('No CSV file is selected')
        if train_size is None or train_size == '':
            return _reject('No value provided for size of training data set.')
        if not isFloat(train_size):
            return _reject('Training Data Set Size must be a float.')

        train_size = float(train_size)
        if not (0.0 < train_size < 1.0):
            return _reject(
                'Training Data Set Size Value must be in between 0.0 and 1.0')
        if random_state is None or random_state == '':
            return _reject('No value provided for random state.')
        if not isInt(random_state):
            return _reject('Random State must be an integer.')
        if shuffle is None:
            return _reject('No option for shuffle is selected.')
        if shuffle == 'N' and stratify == 'Y':
            return _reject('When Shuffle is No, Startify cannot be Yes.')

        csv_path = os.path.join(
            current_app.config['INPUT_DATA_UPLOAD_FOLDER'], train_file)
        data = pd.read_csv(csv_path)

        # Translate the form's 'Y' / other flags into train_test_split inputs.
        do_shuffle = shuffle == "Y"
        stratify_on = data["spam"].values if stratify == "Y" else None

        train_X, test_X, train_Y, test_Y = train_test_split(
            data["text"].values,
            data["spam"].values,
            test_size=1.0 - train_size,
            random_state=int(random_state),
            shuffle=do_shuffle,
            stratify=stratify_on)

        classifier = spamclassifier.SpamClassifier()
        classifier_model, model_word_features = classifier.train(
            train_X, train_Y)

        models_dir = current_app.config['ML_MODEL_UPLOAD_FOLDER']
        model_name = train_file.replace('.csv', '.pk')
        model_word_features_name = train_file.replace(
            '.csv', '_word_features.pk')

        # Persist the model first, then its word features.
        artifacts = (
            (model_name, classifier_model),
            (model_word_features_name, model_word_features),
        )
        for file_name, payload in artifacts:
            with open(os.path.join(models_dir, file_name), 'wb') as model_fp:
                pickle.dump(payload, model_fp)

        return display_models(model_name)
예제 #4
0
def train_dataset():
    '''
    List trainable CSV files (GET) or train and persist a spam model (POST).

    GET: render 'train.html' with 'train_files' set to the CSV files present
    in 'UPLOAD_FOLDER'.

    POST: read 'train_file', 'train_size', 'random_state', 'shuffle' and
    'stratify' from the form. The first failing check below flashes its
    message and redirects to the same URL:

    - no 'train_file':              'No CSV file is selected'
    - missing/empty 'train_size':   'No value provided for size of training data set.'
    - 'train_size' not a float:     'Training Data Set Size must be a float.'
    - 'train_size' outside (0, 1):  'Training Data Set Size Value must be in between 0.0 and 1.0'
    - missing/empty 'random_state': 'No value provided for random state.'
    - 'random_state' not an int:    'Random State must be an integer.'
    - no 'shuffle':                 'No option for shuffle is selected.'
    - shuffle 'N' with stratify 'Y': 'When Shuffle is No, Startify cannot be Yes.'

    When every value is valid, the CSV is split with 'train_test_split' using
    the submitted parameters, a 'SpamClassifier' is trained, and the model
    plus its word features are pickled into 'ML_MODEL_UPLOAD_FOLDER' as
    '<name>.pk' and '<name>_word_features.pk'. The view then returns
    'display_models(success_model=<name>.pk)'. Any exception raised while
    training is flashed to the user and followed by a redirect.
    '''

    def _reject(message):
        # Flash the validation error and send the user back to the form.
        flash(message)
        return redirect(request.url)

    if request.method == "GET":
        files = [
            name for name in os.listdir(app.config['UPLOAD_FOLDER'])
            if name.lower().endswith('.csv')
        ]
        return render_template("train.html", train_files=files)
    elif request.method == "POST":
        train_file = request.form.get("train_file")
        train_size = request.form.get("train_size")
        random_state = request.form.get("random_state")
        shuffle = request.form.get("shuffle")
        stratify = request.form.get("stratify")

        # Check for presence BEFORE converting: int(None) / float(None) raise
        # TypeError, which would otherwise escape as a server error.
        if train_file is None:
            return _reject('No CSV file is selected')
        if not train_size:
            return _reject('No value provided for size of training data set.')
        try:
            train_size = float(train_size)
        except ValueError:
            return _reject('Training Data Set Size must be a float.')
        if not 0.0 < train_size < 1.0:
            return _reject(
                'Training Data Set Size Value must be in between 0.0 and 1.0')
        if not random_state:
            return _reject('No value provided for random state.')
        try:
            random_state = int(random_state)
        except ValueError:
            return _reject('Random State must be an integer.')
        if shuffle is None:
            return _reject('No option for shuffle is selected.')
        if shuffle == "N" and stratify == "Y":
            return _reject('When Shuffle is No, Startify cannot be Yes.')

        try:
            # NOTE(review): 'train_file' comes straight from the form and is
            # used to build a path - confirm it is restricted to listed files.
            data = pd.read_csv(
                os.path.join(app.config['UPLOAD_FOLDER'], train_file))
            train_X, test_X, train_Y, test_Y = train_test_split(
                data["text"].values,
                data["spam"].values,
                # Honour the submitted split instead of a fixed 0.25 test size.
                train_size=train_size,
                random_state=random_state,
                # The form sends 'Y'/'N' strings; a raw non-empty string is
                # always truthy, so convert explicitly.
                shuffle=shuffle != "N",
                stratify=data["spam"].values if stratify == "Y" else None)
            classifier = spamclassifier.SpamClassifier()
            classifier_model, model_word_features = classifier.train(
                train_X, train_Y)

            # splitext drops only the final extension ('my.data.csv' ->
            # 'my.data'), unlike splitting on every dot.
            base_name = os.path.splitext(str(train_file))[0]
            model_name = base_name + ".pk"
            model_word_features_name = base_name + "_word_features.pk"
            models_dir = app.config['ML_MODEL_UPLOAD_FOLDER']
            with open(os.path.join(models_dir, model_name),
                      'wb') as model_fp:
                pickle.dump(classifier_model, model_fp)
            with open(os.path.join(models_dir, model_word_features_name),
                      'wb') as model_fp:
                pickle.dump(model_word_features, model_fp)
            return display_models(success_model=model_name)
        except Exception as e:
            # Top-level boundary of the view: report the failure to the user.
            flash(
                "probably a bad training file, check data and try again. Below Error \n\n"
                + str(e))
            return redirect(request.url)
예제 #5
0
def predict():
    '''
    Render the e-mail submission form (GET) or classify submitted e-mails.

    GET: render 'emailsubmit.html' with 'form' set to an 'InputForm' whose
    'inputmodel' choices are the '.pk' model names found in
    'spamfilter/mlmodels/' (extension removed, word feature files excluded).

    Any other method is handled as a form submission; each failed check
    flashes a message and redirects to this view:

    1. Neither text-area input nor an uploaded file:
       'No Input: Provide a Single or Multiple Emails as Input.'
    2. Both text-area input and an uploaded file:
       'Two Inputs Provided: Provide Only One Input.'
    3. An uploaded file is saved into 'spamfilter/inputdata/' and its
       contents are read; text-area input is used directly.
    4. The text is validated with 'validate_input_text'; a False result gives:
       'Unexpected Format : Input Text is not in Specified Format.'
    5. No model chosen: 'Please Choose a single Model'
    6. The chosen model and its word features are unpickled, predictions of 0
       become 'NOT SPAM' and everything else 'SPAM', the result is written to
       'predictions.json', and 'display_results()' is returned.
    '''
    models_dir = os.path.join(os.getcwd(), 'spamfilter/mlmodels/')

    if request.method == 'GET':
        choices = []
        for entry in os.listdir(models_dir):
            name, ext = os.path.splitext(entry)
            if ext == '.pk' and 'word_features' not in entry:
                choices.append((name, name))

        newForm = InputForm()
        newForm.inputmodel.choices = choices
        return render_template('emailsubmit.html', form=newForm)

    # Missing form fields are treated the same as empty ones.
    input_email = request.form.get('inputemail') or ''
    inputfile = request.files.get('inputfile')
    inputmodel = request.form.get('inputmodel')
    filename = (inputfile.filename or '') if inputfile is not None else ''

    if not input_email and not filename:
        flash('No Input: Provide a Single or Multiple Emails as Input.')
        return redirect(url_for('.predict'))
    if input_email and filename:
        flash('Two Inputs Provided: Provide Only One Input.')
        return redirect(url_for('.predict'))

    if filename:
        # Uploads belong in the input-data folder, not next to the models.
        # basename() drops any directory components sent by the client.
        saved_path = os.path.join(os.getcwd(), 'spamfilter/inputdata/',
                                  os.path.basename(filename))
        inputfile.save(saved_path)
        # Read the saved copy: after save() the upload stream is exhausted,
        # and reading it directly would give bytes rather than text.
        with open(saved_path, 'r') as upload_fp:
            input_text = upload_fp.read()
    else:
        input_text = input_email

    ret_message = validate_input_text(input_text)
    if ret_message is False:
        flash('Unexpected Format : Input Text is not in Specified Format.')
        return redirect(url_for('.predict'))
    if inputmodel is None:
        flash('Please Choose a single Model')
        return redirect(url_for('.predict'))

    # SECURITY NOTE(review): pickle.load executes code embedded in the file and
    # 'inputmodel' comes from the request - confirm it is restricted to the
    # model names offered by the form.
    classifierclass = spamclassifier.SpamClassifier()
    with open(os.path.join(models_dir, inputmodel + '.pk'), 'rb') as model_fp:
        classifierclass.classifier = pickle.load(model_fp)
    with open(os.path.join(models_dir, inputmodel + '_word_features.pk'),
              'rb') as features_fp:
        classifierclass.word_features = pickle.load(features_fp)

    raw_predictions = classifierclass.predict(ret_message)

    # 0 means NOT SPAM; the labels were previously swapped.
    pred_out = OrderedDict(
        (email, 'NOT SPAM' if raw_predictions[email] == 0 else 'SPAM')
        for email in raw_predictions)

    with open('predictions.json', 'w') as fp:
        json.dump(pred_out, fp)

    return display_results()
예제 #6
0
def train_dataset():
    '''
    List trainable CSV files (GET) or train and persist a spam model.

    GET: render 'train.html' with 'train_files' set to the '.csv' files found
    in 'spamfilter/inputdata/'.

    Any other method is handled as a form submission. 'train_file',
    'train_size', 'random_state', 'shuffle' and 'stratify' are read from the
    form; the first failing check below flashes its message and redirects to
    the same URL:

    - no 'train_file':              'No CSV file is selected'
    - missing/empty 'train_size':   'No value provided for size of training data set.'
    - 'train_size' not a float:     'Training Data Set Size must be a float.'
    - 'train_size' outside (0, 1):  'Training Data Set Size Value must be in between 0.0 and 1.0'
    - missing/empty 'random_state': 'No value provided for random state.'
    - 'random_state' not an int:    'Random State must be an integer.'
    - no 'shuffle':                 'No option for shuffle is selected.'
    - shuffle 'No' with stratify 'Yes': 'When Shuffle is No, Startify cannot be Yes.'

    When every value is valid, the CSV is split with 'train_test_split', a
    'SpamClassifier' is trained, and the model plus its word features are
    pickled into 'spamfilter/mlmodels/' as '<name>.pk' and
    '<name>_word_features.pk'. The view then returns
    'display_models(success_model=<name>.pk)'.
    '''

    def _reject(message):
        # Flash the validation error and send the user back to the form.
        flash(message)
        return redirect(request.url)

    input_dir = os.path.join(os.getcwd(), 'spamfilter/inputdata/')

    if request.method == 'GET':
        csv_files = [
            entry for entry in os.listdir(input_dir)
            if os.path.splitext(entry)[1] == '.csv'
        ]
        return render_template('train.html', train_files=csv_files)

    train_file = request.form.get('train_file')
    train_size = request.form.get('train_size')
    random_state = request.form.get('random_state')
    shuffle = request.form.get('shuffle')
    startify = request.form.get('stratify')

    if train_file is None:
        return _reject('No CSV file is selected')
    if not train_size:
        return _reject('No value provided for size of training data set.')
    # Convert inside try/except: calling float()/int() on the raw text before
    # checking it raises ValueError for exactly the input being rejected.
    try:
        train_size = float(train_size)
    except ValueError:
        return _reject('Training Data Set Size must be a float.')
    # Strict bounds: a float train_size of 0.0 or 1.0 is not a usable split.
    if not 0.0 < train_size < 1.0:
        return _reject(
            'Training Data Set Size Value must be in between 0.0 and 1.0')
    if not random_state:
        return _reject('No value provided for random state.')
    try:
        random_state = int(random_state)
    except ValueError:
        return _reject('Random State must be an integer.')
    if shuffle is None:
        return _reject('No option for shuffle is selected.')
    if shuffle == 'No' and startify == 'Yes':
        return _reject('When Shuffle is No, Startify cannot be Yes.')

    data = pd.read_csv(os.path.join(input_dir, train_file))
    train_X, test_X, train_Y, test_Y = train_test_split(
        data["text"].values,
        data["spam"].values,
        train_size=train_size,
        random_state=random_state,
        # Apply the submitted options, which were validated above but never
        # used. Only an explicit 'No' / 'Yes' changes the previous defaults.
        shuffle=shuffle != 'No',
        stratify=data["spam"].values if startify == 'Yes' else None)

    classifier = spamclassifier.SpamClassifier()
    classifier_model, model_word_features = classifier.train(train_X, train_Y)

    # splitext drops only the final extension, so dotted names stay intact.
    model_name = os.path.splitext(train_file)[0]
    models_dir = os.path.join(os.getcwd(), 'spamfilter/mlmodels/')
    model_file = os.path.join(models_dir, model_name + '.pk')
    model_word_features_file = os.path.join(
        models_dir, model_name + '_word_features.pk')

    with open(model_file, 'wb') as model_fp:
        pickle.dump(classifier_model, model_fp)
    with open(model_word_features_file, 'wb') as model_fp:
        pickle.dump(model_word_features, model_fp)

    return display_models(success_model=model_name + '.pk')
예제 #7
0
def predict():
    """
    Render the e-mail submission form (GET) or classify submitted e-mails (POST).

    The 'inputmodel' choices of 'InputForm' are populated from the files in
    the configured 'ML_MODEL_UPLOAD_FOLDER', with the extension stripped and
    the '*_word_features.pk' companion files left out.

    On POST the following checks run in order; each failure flashes the
    quoted message and redirects back to this page:

    1. Neither text-area input nor an uploaded file:
       'No Input: Provide a Single or Multiple Emails as Input.'
    2. Both text-area input and an uploaded file:
       'Two Inputs Provided: Provide Only One Input.'
    3. An uploaded file is saved as 'input.txt' in the configured
       'INPUT_DATA_UPLOAD_FOLDER' and its contents become 'input_txt';
       otherwise the text-area contents are used.
    4. 'validate_input_text(input_txt)' returns a falsy value:
       'Unexpected Format : Input Text is not in Specified Format.'
    5. No model chosen:
       'Please Choose a single Model'

    When every check passes, the chosen model is loaded, each e-mail is
    classified, truthy predictions are mapped to 'SPAM' and falsy ones to
    'NOT SPAM', the result is written to 'predictions.json' in the input
    data folder, and the client is redirected to 'SpamAPI.display_results'.
    """
    form = InputForm()
    model_files = os.listdir(current_app.config['ML_MODEL_UPLOAD_FOLDER'])
    model_names = [
        model_file.rsplit('.', 1)[0]
        for model_file in model_files
        if '_'.join(model_file.rsplit("_", 2)[-2:]) != "word_features.pk"
    ]
    form.inputmodel.choices = [(name, name) for name in model_names]

    if request.method != 'POST':
        return render_template('emailsubmit.html', form=form)

    inputemail = request.form.get('inputemail')
    # `.get` avoids a 400 (KeyError) when files arrive under another field.
    inputfile = request.files.get(form.inputfile.name)
    # Browsers submit an empty, nameless file part when the user selected
    # nothing; that must not count as "a file was provided".
    if inputfile is not None and not inputfile.filename:
        inputfile = None
    inputmodel = request.form.get('inputmodel')

    has_text = bool(inputemail)
    has_file = inputfile is not None

    if not has_text and not has_file:
        flash('No Input: Provide a Single or Multiple Emails as Input.')
        return redirect(url_for('SpamAPI.predict'))
    if has_text and has_file:
        flash('Two Inputs Provided: Provide Only One Input.')
        return redirect(url_for('SpamAPI.predict'))

    # Exactly one source is present from here on.
    if has_file:
        inputfile_path = os.path.join(
            current_app.config['INPUT_DATA_UPLOAD_FOLDER'], 'input.txt')
        inputfile.save(inputfile_path)
        with open(inputfile_path, 'r') as fp:
            input_txt = fp.read()
    else:
        input_txt = inputemail

    emails_dict = validate_input_text(input_txt)
    if not emails_dict:
        flash('Unexpected Format : Input Text is not in Specified Format.')
        return redirect(url_for('SpamAPI.predict'))

    # An empty selection is as unusable as a missing one.
    if not inputmodel:
        flash('Please Choose a single Model')
        return redirect(url_for('SpamAPI.predict'))

    sc = spamclassifier.SpamClassifier()
    # Choice values are already extension-less; the extra strip also
    # tolerates a value posted with a trailing extension.
    sc.load_model(inputmodel.rsplit('.', 1)[0])
    emails_pred = sc.predict(emails_dict)

    emails_pred_final = OrderedDict(
        (email, 'SPAM' if pred else 'NOT SPAM')
        for email, pred in emails_pred.items())

    predictions_fpath = os.path.join(
        current_app.config['INPUT_DATA_UPLOAD_FOLDER'], 'predictions.json')
    with open(predictions_fpath, 'w') as fp:
        json.dump(emails_pred_final, fp)
    return redirect(url_for('SpamAPI.display_results'))
예제 #8
0
def train_dataset():
    """
    Render the training form (GET) or train and store a spam model (POST).

    On GET, 'train.html' is rendered with 'train_files' set to the CSV files
    found in the configured 'INPUT_DATA_UPLOAD_FOLDER'.

    On POST the form values 'train_file', 'train_size', 'random_state',
    'shuffle' and 'stratify' are read and validated in order; each failure
    flashes the quoted message and redirects back to this page:

    - 'train_file' missing or not one of the listed CSV files:
      'No CSV file is selected'
    - 'train_size' missing:
      'No value provided for size of training data set.'
    - 'train_size' not a float:
      'Training Data Set Size must be a float.'
    - 'train_size' not strictly between 0.0 and 1.0:
      'Training Data Set Size Value must be in between 0.0 and 1.0'
    - 'random_state' missing:
      'No value provided for random state.'
    - 'random_state' not an integer:
      'Random State must be an integer.'
    - 'random_state' outside 0..4294967295:
      'Random State must be between 0 and 4294967295.'
    - 'shuffle' missing:
      'No option for shuffle is selected.'
    - 'shuffle' is 'N' while 'stratify' is 'Y':
      'When Shuffle is No, Startify cannot be Yes.'

    With valid input the CSV's 'text' and 'spam' columns are split with
    'train_test_split' using the submitted parameters, a
    'spamclassifier.SpamClassifier' is trained on the training part, and the
    model plus its word features are pickled into the configured
    'ML_MODEL_UPLOAD_FOLDER'. For 'sample.csv' the files are 'sample.pk'
    and 'sample_word_features.pk'.

    Finally the client is redirected to 'SpamAPI.display_models' with
    'success_model' set to the model file name, e.g. 'sample.pk'.
    """
    input_dir = current_app.config['INPUT_DATA_UPLOAD_FOLDER']
    train_files = [
        name for name in os.listdir(input_dir)
        if name.rsplit('.', 1)[-1].lower() == 'csv'
    ]
    if request.method != 'POST':
        return render_template('train.html', train_files=train_files)

    # The file name is user input that ends up in a filesystem path, so
    # only accept names that are actually offered by the form.
    train_file = request.form.get('train_file')
    if train_file not in train_files:
        flash('No CSV file is selected')
        return redirect(request.url)

    train_size = request.form.get('train_size')
    if train_size is None:
        flash('No value provided for size of training data set.')
        return redirect(request.url)
    if not isFloat(train_size):
        flash('Training Data Set Size must be a float.')
        return redirect(request.url)
    train_size = float(train_size)
    # Strict bounds: train_test_split rejects a float train_size of
    # exactly 0.0 or 1.0 with a ValueError.
    if not 0.0 < train_size < 1.0:
        flash('Training Data Set Size Value must be in between 0.0 and 1.0')
        return redirect(request.url)

    random_state = request.form.get('random_state')
    if random_state is None:
        flash('No value provided for random state.')
        return redirect(request.url)
    if not isInt(random_state):
        flash('Random State must be an integer.')
        return redirect(request.url)
    # NOTE(review): assumes isInt() only accepts strings int() can parse.
    random_state = int(random_state)
    # Integer seeds outside this range are rejected by scikit-learn/NumPy.
    if not 0 <= random_state <= 2 ** 32 - 1:
        flash('Random State must be between 0 and 4294967295.')
        return redirect(request.url)

    if 'shuffle' not in request.form:
        flash('No option for shuffle is selected.')
        return redirect(request.url)
    # NOTE(review): the form presumably posts 'Y'/'N' for both options;
    # confirm against train.html.
    shuffle = request.form['shuffle']
    # A missing 'stratify' field is treated as "not stratified" instead of
    # aborting the request with a 400.
    stratify = request.form.get('stratify')
    if shuffle == 'N' and stratify == 'Y':
        flash('When Shuffle is No, Startify cannot be Yes.')
        return redirect(request.url)
    shuffle = shuffle == 'Y'
    stratify = stratify == 'Y'

    df = pd.read_csv(os.path.join(input_dir, train_file))
    labels = df["spam"].values
    # Stratification is only meaningful (and only allowed) when shuffling.
    train_X, _, train_Y, _ = train_test_split(
        df["text"].values,
        labels,
        train_size=train_size,
        random_state=random_state,
        shuffle=shuffle,
        stratify=labels if (shuffle and stratify) else None)

    classifier = spamclassifier.SpamClassifier()
    classifier_model, classifier_word_features = classifier.train(
        train_X, train_Y)

    train_file_name = train_file.rsplit(".", 1)[0]
    model_name = train_file_name + '.pk'
    model_word_features_name = train_file_name + '_word_features.pk'
    model_dir = current_app.config['ML_MODEL_UPLOAD_FOLDER']
    model_file = os.path.join(model_dir, model_name)
    model_wf_file = os.path.join(model_dir, model_word_features_name)
    with open(model_file, 'wb') as model_fp:
        pickle.dump(classifier_model, model_fp)
    with open(model_wf_file, 'wb') as model_wfp:
        pickle.dump(classifier_word_features, model_wfp)
    return redirect(
        url_for('SpamAPI.display_models', success_model=model_name))