Example #1
def runXGBoostRanker():
    print("####  Running: RunMe.runXGBoostRanker() ####")
    reader = HomeDepotReader()
    feature_df = reader.getBasicDataFrame(
        "../data/features_doc2vec_sense2vec_20170416.csv")

    feature_train_df = feature_df[:74067]
    feature_test_df = feature_df[74067:]

    feature_test_df.pop('relevance')

    soln_filename = '../data/solution.csv'
    soln_df = pd.read_csv(soln_filename,
                          delimiter=',',
                          low_memory=False,
                          encoding="ISO-8859-1")
    dp = DataPreprocessing()
    test_private_df = dp.getGoldTestSet(feature_test_df,
                                        soln_df,
                                        testsetoption='Private')
    test_public_df = dp.getGoldTestSet(feature_test_df,
                                       soln_df,
                                       testsetoption='Public')

    xgb = XGBoostRanker(feature_train_df)
    xgb.train_Regressor(feature_train_df)
    # xgb.gridSearch_Regressor(feature_train_df)

    # result_df = xgb.test_Model(test_public_df)
    result_df = xgb.test_Model(test_private_df)
Example #2
class FingerprintAuthentication:
    def __init__(self):
        datadir = '..{0}data{0}images'.format(os.sep)
        resnetmodel_path = '..{0}model{0}resnet_model.h5'.format(os.sep)
        siamesemodel_path = '..{0}model{0}siamese_model.h5'.format(os.sep)

        self.dataPreprocessing = DataPreprocessing(datadir, resnetmodel_path)
        self.fingerprintModel = FingerprintModel(siamesemodel_path,
                                                 resnetmodel_path)
        self.keylist = list(self.dataPreprocessing.fingerprintDatabase.keys())

    def matchFingerprint(self, fingerprint, person_to_match=None):

        # If no person id is specified, match against the whole DB
        if person_to_match is None:
            siamesefeature_one, siamesefeature_two = self.dataPreprocessing.createTestData(
                fingerprint)
        else:
            template_DB_keylist = []
            siamesefeature_one, siamesefeature_two = self.dataPreprocessing.createTestDataWithPersonID(
                fingerprint, person_to_match, template_DB_keylist)
        match_index, matched_prob = self.fingerprintModel.predict(
            siamesefeature_one, siamesefeature_two)
        DBkeylist = self.keylist if person_to_match is None else template_DB_keylist
        person_id = DBkeylist[match_index].split('aug')[0].split('_')[0]
        matched_tif = DBkeylist[match_index]
        matched_person = self.dataPreprocessing.person_guid_map[person_id]
        return matched_tif, matched_prob, matched_person
Example #3
    def train(self, trainDF, validateDF):
        print("+++++++++++++++++++++Training model...")
        # print("Generating new labels...")
        # dp=DataPreprocessing()
        # trainDF,validateDF=dp.transformLabels(trainDF=trainDF,validationDF=validateDF, newColName=self.yColDiscrete)
        # print("New labels generated...")

        print("Remove non trainable features...")
        self.xTrain = trainDF
        self.yTrain = trainDF[self.yColDiscrete]
        # self.xValidate=validateDF
        # self.yValidate=validateDF[self.yColDiscrete]

        # self.xTrain.drop('search_term', axis=1, inplace=True)
        # self.xTrain.drop('relevance', axis=1, inplace=True)
        if ('relevance_int' in self.xTrain):
            self.xTrain = self.xTrain.drop('relevance_int', axis=1)
        self.xTrain = self.xTrain.replace('inf', 99999)

        self.xTrain.drop('id', axis=1, inplace=True)
        self.xTrain.drop('search_term', axis=1, inplace=True)
        self.xTrain.drop('product_uid', axis=1, inplace=True)
        self.xTrain.drop('relevance', axis=1, inplace=True)
        self.xTrain.drop('product_idx', axis=1, inplace=True)
        self.xTrain.drop('Word2VecQueryExpansion', axis=1, inplace=True)
        self.xTrain.drop('len_search_term', axis=1, inplace=True)
        self.xTrain.drop('len_product_title', axis=1, inplace=True)

        # self.xTrain.drop('product_idx', axis=1, inplace=True)
        # self.xTrain.drop('Word2VecQueryExpansion', axis=1, inplace=True)

        # self.xValidate.drop('search_term', axis=1, inplace=True)
        # self.xValidate.drop('relevance', axis=1, inplace=True)
        # self.xValidate.drop('relevance_int', axis=1, inplace=True)
        # self.xValidate.drop('product_idx', axis=1, inplace=True)
        # self.xValidate.drop('Word2VecQueryExpansion', axis=1, inplace=True)

        print("+++++++++++++++++++++Training in progress")
        # print("self.xTrain:",list(self.xTrain))
        # print("self.yTrain:", list(self.yTrain))
        self.fittedModel = self.model.fit(self.xTrain, self.yTrain)
        self.yPred = self.fittedModel.predict(self.xTrain)
        # print("self.yPred:", self.yPred.shape, self.yPred[1:50, ])

        # print("self.yPred:", list(self.yPred))

        print("Converting to old labels")
        dp = DataPreprocessing()
        self.yTrain = dp.transformNewLabelToOld(self.yTrain.to_numpy())
        self.yPred = dp.transformNewLabelToOld(self.yPred)
        # print("self.yTrain:", self.yTrain.shape,self.yTrain[1:50,])
        # print("self.yPred:", self.yPred.shape, self.yPred[1:50, ])

        print("MSE:", mean_squared_error(self.yTrain, self.yPred))
        print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
        # print("Accuracy:", accuracy_score(self.yTrain, self.yPred))
        # print("Precision:", precision_score(self.yTrain, self.yPred, average='micro'))
        # print("Recall:", recall_score(self.yTrain, self.yPred, average='micro'))
        # print("F1:", f1_score(self.yTrain, self.yPred, average='micro'))
        print("+++++++++++++++++++++Training completed")
Example #4
    def __convertToCorpus(self, documents):
        """
        Steps to make the documents compatible to gensim
        Changelog
        - 15/3 KS First commit
        :param documents:
        :return:
        """
        #Preprocessing the text
        dp = DataPreprocessing()
        text = dp.getBagOfWords(documentDF=documents,
                                return_type='document_tokens')

        #Create a Gensim text corpus based on documents
        print("Creating a text dictionary")
        self.dictionary = Dictionary(line.lower().split()
                                     for line in documents)
        print(self.dictionary)
        print("Saving text dictionary to file")
        self.dictionary.save('../data.prune/producttext.dict')

        #Create a Gensim document corpus based on text corpus and each document
        print("Creating a Gensim document corpus")
        self.corpus = [self.dictionary.doc2bow(line) for line in text]

        print("Saving corpus to file")
        MmCorpus.serialize('../data.prune/productcorpus.mm', self.corpus)
        self.corpus = MmCorpus('../data.prune/productcorpus.mm')
        print(self.corpus)
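
For completeness, the saved dictionary and corpus can be reloaded later with the same gensim classes used above (paths as in the method):

from gensim.corpora import Dictionary, MmCorpus

# Reload the artifacts written by __convertToCorpus.
dictionary = Dictionary.load('../data.prune/producttext.dict')
corpus = MmCorpus('../data.prune/productcorpus.mm')
print(dictionary, corpus)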
Example #5
    def validate(self,
                 testDF,
                 savePredictedFilename='../data/defaultPredictSave.csv'):
        print("+++++++++++++++++++++Validation start")
        print("Remove non trainable features...")

        savePredictedFilename = savePredictedFilename.split("csv")
        savePrediction = testDF['id'].to_numpy()
        print("Saveprediction=", savePrediction)
        self.xTest = testDF
        self.yTest = testDF[self.yColDiscrete]
        if ('relevance_int' in self.xTest):
            self.xTest = self.xTest.drop('relevance_int', axis=1)
        predictedDF = self.xTest

        self.xTest = self.xTest.replace('inf', 99999)
        self.xTest = self.xTest.drop('id', axis=1)
        self.xTest = self.xTest.drop([
            'search_term', 'product_uid', 'relevance', 'product_idx',
            'Word2VecQueryExpansion', 'len_search_term', 'len_product_title'
        ],
                                     axis=1)
        self.yPred = self.fittedModel.predict(self.xTest)
        predictedDF['relevance_int'] = self.yPred

        avgNDCG = NDCG_Eval().computeAvgNDCG(
            testDF, predictedDF, savePredictedFilename[0] + "nDCG" + ".csv")
        # print("avgNDCG:",avgNDCG)
        print("Converting to old labels")
        dp = DataPreprocessing()
        self.yTest = dp.transformNewLabelToOld(self.yTest)
        self.yPred = dp.transformNewLabelToOld(self.yPred)
        # print("self.yTest:", self.yTest.shape,self.yTest[1:50,])
        # print("self.yPred:", self.yPred.shape, self.yPred[1:50, ])

        savePrediction = pd.DataFrame(savePrediction, columns=['id'])
        ypredDF = pd.DataFrame(self.yPred, columns=['pred_relevance'])
        ypredDF = ypredDF.reset_index(drop=True)
        print("savePrediction.size:", savePrediction.size)
        print("savePrediction.size:", savePrediction.shape)
        print("ypredDF.size:", ypredDF.size)
        assert (savePrediction.size == ypredDF.size)
        # predictionResults=pd.concat([savePrediction,ypredDF],axis=1)
        predictionResults = savePrediction.join(ypredDF)
        predictionResults.to_csv(savePredictedFilename[0] + "csv", index=False)
        print("predictionResults.size:", predictionResults.shape)
        print("MSE:", mean_squared_error(self.yTest, self.yPred))
        print("RMSE:", sqrt(mean_squared_error(self.yTest, self.yPred)))
        results_path = savePredictedFilename[0] + "RMSE_NDCG_" + ".csv"
        writeResults = (results_path + "\nRMSE:" +
                        str(sqrt(mean_squared_error(self.yTest, self.yPred))) +
                        "\nNDCG:" + str(avgNDCG))
        with open(results_path, 'w') as results_file:
            results_file.write(writeResults)
        print("+++++++++++++++++++++Validation end")
Example #6
    def __init__(self):
        datadir = '..{0}data{0}images'.format(os.sep)
        resnetmodel_path = '..{0}model{0}resnet_model.h5'.format(os.sep)
        siamesemodel_path = '..{0}model{0}siamese_model.h5'.format(os.sep)

        self.dataPreprocessing = DataPreprocessing(datadir, resnetmodel_path)
        self.fingerprintModel = FingerprintModel(siamesemodel_path,
                                                 resnetmodel_path)
        self.keylist = list(self.dataPreprocessing.fingerprintDatabase.keys())
Example #7
def flask_button_click():
    # In the original app the text comes from a Flask text box (see Example #9);
    # an empty string is used here only as a placeholder so the snippet runs.
    text = ""
    proc = DataPreprocessing()
    load_data = proc.load_pickle("TokenizerData")
    predictor = Prediction(load_data)
    encoder_model = predictor.load_model('models\\encoder_model.json',
                                         'models\\encoder_model_weights.h5')
    decoder_model = predictor.load_model('models\\decoder_model.json',
                                         'models\\decoder_model_weights.h5')
    summary = predictor.generated_summaries(text, encoder_model, decoder_model)
Example #8
def oldLabelScorer(y, y_pred):
    # Custom scoring
    # print("Converting to old labels")
    dp = DataPreprocessing()
    y = dp.transformNewLabelToOld(y.to_numpy())
    y_pred = dp.transformNewLabelToOld(y_pred)
    # print("y:", y.shape, y[1:50, ])
    # print("y_pred:", y_pred.shape, y_pred[1:50, ])
    rmse = sqrt(mean_squared_error(y, y_pred))
    # print("RMSE:", rmse)
    return -rmse
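
A scorer with this (y, y_pred) signature is typically wrapped with scikit-learn's make_scorer before being handed to a grid search. A minimal, hypothetical sketch follows; the estimator and parameter grid are placeholders, not code from the original project:

from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier  # placeholder estimator

# oldLabelScorer already returns a negated RMSE, so the default
# greater_is_better=True is correct: GridSearchCV maximises the score,
# i.e. minimises the RMSE on the back-transformed labels.
old_label_scorer = make_scorer(oldLabelScorer)

search = GridSearchCV(RandomForestClassifier(random_state=123),
                      param_grid={'n_estimators': [100, 300]},
                      scoring=old_label_scorer,
                      cv=3)
# search.fit(X_train, y_train)  # y_train should stay a pandas Series so the
#                               # scorer's .to_numpy() call works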
Example #9
def input():
    #  getting text from user

    summary = ""
    if request.method == 'POST':
        text = request.form.get("text_in")
    # creating summary
        proc = DataPreprocessing()
        load_data = proc.load_pickle("TokenizerData")
        predictor = Prediction(load_data)
        encoder_model = predictor.load_model('models/encoder_model.json', 'models/encoder_model_weights.h5')
        decoder_model = predictor.load_model('models/decoder_model.json', 'models/decoder_model_weights.h5')
        summary = predictor.generated_summaries(text, encoder_model, decoder_model)

    return render_template("summary.html",summary=summary)
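
The route decorator and application setup are not part of the snippet above; a hypothetical wiring (the route path and run options are assumptions) could look like this:

from flask import Flask

app = Flask(__name__)
# Register the view defined above under an assumed /summary route.
app.add_url_rule("/summary", view_func=input, methods=["GET", "POST"])

if __name__ == "__main__":
    app.run(debug=True)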
Example #10
def main():
    # split data
    training, validation, test = DataPreprocessing().split_data()
    print(type(training))
    df = DataPreprocessing().prepare_data_for_statistics("train")
    # apply word-based features
    WordBasedFeatures(df).features()
    # apply syntactic features
    SyntacticFeatures(df).features()
    SyntacticFeatures(df).outputter()
    # train model
    model = Classifier(df)

    print(model.predict(df))
Example #11
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 13:23:12 2018

@author: shifuddin
"""

from sklearn.neural_network import MLPRegressor
from DataPreprocessing import DataPreprocessing

'''
Create object of preprocessing class
'''
preprocessing = DataPreprocessing('Position_Salaries.csv')

'''
Read features and outcome from csv file
'''
X, Y = preprocessing.read_csv(1,2,2)


'''
Scale features and outcome
'''
X, Y = preprocessing.scale_data(X, Y)

'''
Create regressor from MLPRegressor
'''
regressor = MLPRegressor(hidden_layer_sizes=(100, 50))
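
The snippet above is cut off in the source before the regressor is trained. A minimal continuation, assuming the same scale_features and reverse_outcome helpers that DataPreprocessing exposes in Example #16 below, could look like this:

import numpy as np

# Hedged continuation: fit on the scaled data and predict for level 6.5,
# mirroring the SVR example (Example #16).
regressor.fit(X, Y)
predicted_scaled = regressor.predict(preprocessing.scale_features(np.array([[6.5]])))
predicted_salary = preprocessing.reverse_outcome(predicted_scaled)
print(predicted_salary)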
Example #12
    def __createAttributeColumn(self, product_df, attribute_df):
        dp = DataPreprocessing()
        attribute_doc_df = dp.getAttributeDoc(attribute_df)
        # attribute_doc_df
        return product_df.join(attribute_doc_df.set_index('product_uid'),
                               on='product_uid')
Example #13
    def __init__(self, model='ranf', _type="reg"):
        super().__init__(model)
        self.preprocessing = DataPreprocessing()
        # self.frameData = Agg_Frame_Data()
        self.model_type = model
        self.type = _type
        self.parameters = []
        self.all_models = []
        self.output_length = None
        self.output_columns = None
        self.feature_columns = None

        if self.model_type == 'ranf':
            self.model = RandomForestRegressor(warm_start=True,
                                               verbose=1,
                                               random_state=123)
            Unaided_Branding_params = {
                'bootstrap': False,
                'max_depth': 11,
                'max_features': 0.25,
                'min_samples_leaf': 2,
                'n_estimators': 100
            }
            Brand_mean_cues_params = {
                'bootstrap': False,
                'max_depth': 15,
                'max_features': 0.25,
                'min_samples_leaf': 2,
                'n_estimators': 250
            }
            Aided_Branding__Mean_params = {
                'bootstrap': False,
                'max_depth': 15,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'n_estimators': 100
            }
            Active_Involvement__Mean_params = {
                'bootstrap': False,
                'max_depth': 14,
                'max_features': 0.25,
                'min_samples_leaf': 2,
                'n_estimators': 300
            }
            New_Information__Mean_params = {
                'bootstrap': False,
                'max_depth': 13,
                'max_features': 0.4,
                'min_samples_leaf': 2,
                'n_estimators': 200
            }
            Enjoyment__Mean_params = {
                'bootstrap': False,
                'max_depth': 12,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'n_estimators': 300
            }
            Brand_Appeal__Mean_params = {
                'bootstrap': False,
                'max_depth': 12,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'n_estimators': 300
            }
            Understanding__Mean_params = {
                'bootstrap': False,
                'max_depth': 15,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'n_estimators': 150
            }
            Relevance_of_Information__Mean_params = {
                'bootstrap': False,
                'max_depth': 12,
                'max_features': 0.25,
                'min_samples_leaf': 2,
                'n_estimators': 150
            }
            Credibility_of_Information__Mean_params = {
                'bootstrap': False,
                'max_depth': 11,
                'max_features': 0.25,
                'min_samples_leaf': 2,
                'n_estimators': 100
            }
            Brand_Difference__Mean_params = {
                'bootstrap': False,
                'max_depth': 13,
                'max_features': 0.25,
                'min_samples_leaf': 2,
                'n_estimators': 400
            }
            Interest_peak_params = {
                'bootstrap': False,
                'max_depth': 13,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'n_estimators': 200
            }
            Interest_mean_params = {
                'bootstrap': False,
                'max_depth': 15,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'n_estimators': 200
            }
            Purchase_intent_params = {
                'bootstrap': False,
                'max_depth': 15,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'n_estimators': 400
            }
            Persuasion_mean_params = {
                'bootstrap': False,
                'max_depth': 13,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'n_estimators': 200
            }
            Persuasion_likely_params = {
                'bootstrap': False,
                'max_depth': 20,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'n_estimators': 400
            }
            Interest_frames_params = {
                'bootstrap': False,
                'max_depth': 20,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'n_estimators': 400
            }

        elif self.model_type == 'xgb':
            self.model = xgb.XGBRegressor(eta=0.3,
                                          save_period=1,
                                          random_state=123)
            #### RMSE was decreasing with increasing n_estimators
            Unaided_Branding_params = {
                'colsample_bytree': 0.4,
                'max_depth': 10,
                'min_samples_leaf': 4,
                'n_estimators': 2000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Brand_mean_cues_params = {
                'colsample_bytree': 0.2,
                'max_depth': 10,
                'min_samples_leaf': 2,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Aided_Branding__Mean_params = {
                'colsample_bytree': 0.2,
                'max_depth': 15,
                'min_samples_leaf': 4,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Active_Involvement__Mean_params = {
                'colsample_bytree': 0.6,
                'max_depth': 10,
                'min_samples_leaf': 2,
                'n_estimators': 900,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            New_Information__Mean_params = {
                'colsample_bytree': 0.4,
                'max_depth': 10,
                'min_samples_leaf': 2,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 10,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Enjoyment__Mean_params = {
                'colsample_bytree': 0.8,
                'max_depth': 10,
                'min_samples_leaf': 2,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Brand_Appeal__Mean_params = {
                'colsample_bytree': 0.6,
                'max_depth': 10,
                'min_samples_leaf': 2,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Understanding__Mean_params = {
                'colsample_bytree': 0.6,
                'max_depth': 10,
                'min_samples_leaf': 2,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Relevance_of_Information__Mean_params = {
                'colsample_bytree': 0.2,
                'max_depth': 10,
                'min_samples_leaf': 2,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Credibility_of_Information__Mean_params = {
                'colsample_bytree': 0.3,
                'max_depth': 10,
                'min_samples_leaf': 2,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Brand_Difference__Mean_params = {
                'colsample_bytree': 0.2,
                'max_depth': 8,
                'min_samples_leaf': 4,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Interest_peak_params = {
                'colsample_bytree': 0.2,
                'max_depth': 10,
                'min_samples_leaf': 4,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Interest_mean_params = {
                'colsample_bytree': 0.2,
                'max_depth': 10,
                'min_samples_leaf': 4,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Purchase_intent_params = {
                'colsample_bytree': 0.2,
                'max_depth': 10,
                'min_samples_leaf': 4,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Persuasion_mean_params = {
                'colsample_bytree': 0.4,
                'max_depth': 10,
                'min_samples_leaf': 4,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Persuasion_likely_params = {
                'colsample_bytree': 0.4,
                'max_depth': 10,
                'min_samples_leaf': 4,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }
            Interest_frames_params = {
                'colsample_bytree': 0.4,
                'max_depth': 10,
                'min_samples_leaf': 4,
                'n_estimators': 1000,
                "objective": 'reg:linear',
                "silent": False,
                "alpha": 1,
                "learning_rate": 0.01,
                "reg_lambda": 1
            }

        elif self.model_type == 'DT':
            self.model = DecisionTreeRegressor(random_state=123)
            Unaided_Branding_params = {
                'splitter': 'best',
                'max_depth': 20,
                'max_features': 0.6,
                'min_samples_leaf': 2,
                'presort': True
            }
            Brand_mean_cues_params = {
                'splitter': 'best',
                'max_depth': 20,
                'max_features': 0.6,
                'min_samples_leaf': 2,
                'presort': True
            }
            Aided_Branding__Mean_params = {
                'splitter': 'best',
                'max_depth': 20,
                'max_features': 0.2,
                'min_samples_leaf': 2,
                'presort': True
            }
            Active_Involvement__Mean_params = {
                'max_depth': 20,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'presort': True
            }
            New_Information__Mean_params = {
                'max_depth': 20,
                'max_features': 0.2,
                'min_samples_leaf': 2,
                'presort': True
            }
            Enjoyment__Mean_params = {
                'max_depth': 20,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'presort': True
            }
            Brand_Appeal__Mean_params = {
                'max_depth': 20,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'presort': True
            }
            Understanding__Mean_params = {
                'max_depth': 30,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'presort': True
            }
            Relevance_of_Information__Mean_params = {
                'max_depth': 30,
                'max_features': 0.3,
                'min_samples_leaf': 2,
                'presort': True
            }
            Credibility_of_Information__Mean_params = {
                'max_depth': 30,
                'max_features': 0.2,
                'min_samples_leaf': 2,
                'presort': True
            }
            Brand_Difference__Mean_params = {
                'max_depth': 30,
                'max_features': 0.2,
                'min_samples_leaf': 2,
                'presort': True
            }
            Interest_peak_params = {
                'max_depth': 30,
                'max_features': 0.2,
                'min_samples_leaf': 2,
                'presort': True
            }
            Interest_mean_params = {
                'max_depth': 30,
                'max_features': 0.2,
                'min_samples_leaf': 2,
                'presort': True
            }
            Purchase_intent_params = {
                'max_depth': 30,
                'max_features': 0.2,
                'min_samples_leaf': 2,
                'presort': True
            }
            Persuasion_mean_params = {
                'max_depth': 30,
                'max_features': 0.2,
                'min_samples_leaf': 2,
                'presort': True
            }
            Persuasion_likely_params = {
                'max_depth': 30,
                'max_features': 0.2,
                'min_samples_leaf': 2,
                'presort': True
            }
            Interest_frames_params = {
                'max_depth': 30,
                'max_features': 0.2,
                'min_samples_leaf': 2,
                'presort': True
            }

        self.training_params = {
            "Unaided_Branding": Unaided_Branding_params,
            "Brand_Cues__Mean": Brand_mean_cues_params,
            "Aided_Branding__Mean": Aided_Branding__Mean_params,
            "Active_Involvement__Mean": Active_Involvement__Mean_params,
            "New_Information__Mean": New_Information__Mean_params,
            "Enjoyment__Mean": Enjoyment__Mean_params,
            "Brand_Appeal__Mean": Brand_Appeal__Mean_params,
            "Understanding__Mean": Understanding__Mean_params,
            "Relevance_of_Information__Mean":
            Relevance_of_Information__Mean_params,
            "Credibility_of_Information__Mean":
            Credibility_of_Information__Mean_params,
            "Brand_Difference__Mean": Brand_Difference__Mean_params,
            "Interest_peak": Interest_peak_params,
            "Interest_mean_score": Interest_mean_params,
            "Purchase_intent": Purchase_intent_params,
            "Persuasion_mean": Persuasion_mean_params,
            "Persuasion_very_likely": Persuasion_likely_params,
            "Interest_peak_frames": Interest_frames_params
        }
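
The dictionaries above are keyed by target column, so presumably one estimator is fitted per target with its own parameter set. A hypothetical sketch of that step (the function, frame, and column names are stand-ins, not code from the original class):

from sklearn.base import clone

def fit_per_target(base_model, training_params, X, y_frame):
    """Fit one estimator per target column using its stored parameter set."""
    fitted = {}
    for target, params in training_params.items():
        # params must match the chosen estimator's accepted keyword arguments.
        estimator = clone(base_model).set_params(**params)
        fitted[target] = estimator.fit(X, y_frame[target])
    return fitted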
Example #14
import sys
import argparse

def getOptions(args=sys.argv[1:]):
    parser = argparse.ArgumentParser(description="Parse command")
    parser.add_argument("-s", "--dbserver", help="DB server", required=True)
    parser.add_argument("-d", "--database", help="database id on DB", required=True)
    parser.add_argument("-u", "--username", help="user id", required=True)
    parser.add_argument("-p", "--password", help="user password", required=True)

    options = parser.parse_args(args)
    return options

if __name__ == '__main__':
    options = getOptions(sys.argv[1:])

    sensorDB = SensorDB()
    sensorDB.connect(options.dbserver, options.database, options.username, options.password)

    processing = DataPreprocessing(sensorDB)
    processing.run()

    model = ModelTraining()
    model.load()
    model.train()
    model.save()

    detector = MachineStatusDetector()
    detector.load()
    data = [1, 0, 0, 100.98, 84.19363636363636, 590.1159504132231, 24.29230228721072]
    detector.detect(data)
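
getOptions can also be exercised directly with an explicit argument list, which is convenient for testing the parser without a real command line (the values below are placeholders):

# Quick parser check with placeholder values.
opts = getOptions(["-s", "localhost", "-d", "sensordb", "-u", "admin", "-p", "secret"])
print(opts.dbserver, opts.database, opts.username)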
Example #15
    # orModel.gridSearch(feature_train_df, None)
    print("####  Completed: OrdinalRegression ordridge training ####")
    utility.checkpointTimeTrack()

    #Validation/Test set
    print(
        "####  OrdinalRegression ordridge validating public/private sets ####")
    print("Loading solution")
    soln_filename = '../data/solution.csv'
    print("Completed Loading solution")
    soln_df = pd.read_csv(soln_filename,
                          delimiter=',',
                          low_memory=True,
                          encoding="ISO-8859-1")
    # print(soln_df.info())
    dp = DataPreprocessing()
    # df_a.merge(df_b, on='mukey', how='left')
    test_private_df = dp.getGoldTestSet(
        feature_test_df, soln_df,
        testsetoption='Private')  # ,savepath='../data/test_private_gold.csv')
    test_public_df = dp.getGoldTestSet(
        feature_test_df, soln_df,
        testsetoption='Public')  # savepath='../data/test_public_gold.csv')

    test_private_df = dp.transformLabels(test_private_df)
    test_public_df = dp.transformLabels(test_public_df)

    # test_private_df.drop('id', axis=1, inplace=True)
    # test_private_df.drop('search_term', axis=1, inplace=True)
    # test_private_df.drop('product_uid', axis=1, inplace=True)
    # test_private_df.drop('relevance', axis=1, inplace=True)
Example #16
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 19:36:21 2018

@author: shifuddin
"""

from DataPreprocessing import DataPreprocessing
preprocessing = DataPreprocessing('Position_Salaries.csv')

features, outcome = preprocessing.read_data(1, 2, 2)

features_scaled, outcome_scaled = preprocessing.scale_data(features, outcome)

from sklearn.svm import SVR
import numpy as np
regressor = SVR(kernel='rbf')
regressor.fit(features_scaled, outcome_scaled)

predicted_salary_scaled = regressor.predict(
    preprocessing.scale_features(np.array([[6.5]])))

predicted_salary = preprocessing.reverse_outcome(predicted_salary_scaled)

import matplotlib.pyplot as plt
# Visualising the SVR results
plt.scatter(features_scaled, outcome_scaled, color='red')
plt.plot(features_scaled, regressor.predict(features_scaled), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
Example #17
        "../data/features_final_20170419.csv")

    columnname = feature_df.columns
    feature_train_df = feature_df

    feature_train_df = feature_df[:74067]
    feature_test_df = feature_df[74067:]

    feature_test_df.pop('relevance')

    soln_filename = '../data/solution.csv'
    soln_df = pd.read_csv(soln_filename,
                          delimiter=',',
                          low_memory=False,
                          encoding="ISO-8859-1")
    dp = DataPreprocessing()
    test_private_df = dp.getGoldTestSet(feature_test_df,
                                        soln_df,
                                        testsetoption='Private')
    test_public_df = dp.getGoldTestSet(feature_test_df,
                                       soln_df,
                                       testsetoption='Public')

    print("####  Running: XGBoostRanker.runXGBoostRanker() ####")
    xgb = XGBoostRanker(feature_train_df)
    xgb.train_Regressor(feature_train_df)

    result_public_df = xgb.test_Model(test_public_df, "Public")
    result_private_df = xgb.test_Model(test_private_df, "Private")

    # gold_df = pd.DataFrame()
Example #18
# In[9]:

from Prediction import Prediction
from TextCleaner import TextCleaner
from DataPreprocessing import DataPreprocessing

# In[19]:

from keras.preprocessing.sequence import pad_sequences

# In[11]:

predictor = Prediction()
cleaner = TextCleaner()
processor = DataPreprocessing()

# In[12]:

loaded_data = processor.load_pickle('TokenizerData')

(x_tokenizer, y_tokenizer, x_vocab_size, y_vocab_size, input_word_index,
 target_word_index, reversed_input_word_index, reversed_target_word_index,
 max_length_text, max_length_summary) = loaded_data[:10]

# In[3]:

# Load trained model
encoder_model = predictor.load_model('encoder_model.json',
                                     'encoder_model_weights.h5')
Example #19
def test_DataPreprocessing():
    """Test the limit case of the class DataPreprocessing when all the parameters are given as input"""

    db_path = "./house-prices/house-prices.csv"

    # Nominal parameters:
    nominal = [
        "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour",
        "Lot Config", "Neighborhood", "Condition 1", "Condition 2",
        "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st",
        "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air",
        "Garage Type", "Misc Feature", "Sale Type", "Sale Condition"
    ]

    # Continuous parameters:
    continuous = [
        "Lot Frontage", "Lot Area", "Mas Vnr Area", "BsmtFin SF 1",
        "BsmtFin SF 2", "Bsmt Unf SF", "Total Bsmt SF", "1st Flr SF",
        "2nd Flr SF", "Low Qual Fin SF", "Gr Liv Area", "Garage Area",
        "Wood Deck SF", "Open Porch SF", "Enclosed Porch", "3Ssn Porch",
        "Screen Porch", "Pool Area", "Misc Val"
    ]

    # Ordinal parameters:
    ordinal = [
        "Lot Shape", "Utilities", "Land Slope", "Overall Qual", "Overall Cond",
        "Exter Qual", "Exter Cond", "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure",
        "BsmtFin Type 1", "BsmtFin Type 2", "Heating QC", "Electrical",
        "Kitchen Qual", "Functional", "Fireplace Qu", "Garage Finish",
        "Garage Qual", "Garage Cond", "Paved Drive", "Pool QC", "Fence"
    ]

    # Discrete parameters:
    discrete = [
        "Year Built", "Year Remod/Add", "Bsmt Full Bath", "Bsmt Half Bath",
        "Full Bath", "Half Bath", "Bedroom AbvGr", "Kitchen AbvGr",
        "TotRms AbvGrd", "Fireplaces", "Garage Yr Blt", "Garage Cars",
        "Mo Sold", "Yr Sold"
    ]

    protocol = [0.8, 0.1, 0.1]

    house_price_db = HousePricesDatabase(db_path, continuous, discrete,
                                         ordinal, nominal, protocol)
    train_set, cv_set, test_set = house_price_db()

    preprocessing = DataPreprocessing(train_set, cv_set, test_set)
    X, y, mean_sale_price, std_sale_price = preprocessing()

    # Check correct number of samples in each set
    nose.tools.eq_(X[0].shape[0], 1492)
    nose.tools.eq_(X[1].shape[0], 186)
    nose.tools.eq_(X[2].shape[0], 187)
    # Check the correct number of parameters per sample
    nose.tools.eq_(X[0].shape[1], 333)
    nose.tools.eq_(X[1].shape[1], 333)
    nose.tools.eq_(X[2].shape[1], 333)

    # Check correct number of samples in each target set
    nose.tools.eq_(y[0].shape[0], 1492)
    nose.tools.eq_(y[1].shape[0], 186)
    nose.tools.eq_(y[2].shape[0], 187)

    # Verify that the same samples were used.
    np.testing.assert_array_almost_equal(np.sum(X[0]), 68632.00000, decimal=5)
    np.testing.assert_array_almost_equal(np.sum(X[1]), 8477.02924, decimal=5)
    np.testing.assert_array_almost_equal(np.sum(X[2]), 8372.82617, decimal=5)

    np.testing.assert_array_almost_equal(np.sum(y[0]),
                                         1.421085e-13,
                                         decimal=18)
    np.testing.assert_array_almost_equal(np.sum(y[1]), -14.8419728, decimal=7)
    np.testing.assert_array_almost_equal(np.sum(y[2]), -23.7408063, decimal=7)

    # Check that z-normalization of target was done with correct mean and std
    np.testing.assert_array_almost_equal(mean_sale_price,
                                         185637.85120,
                                         decimal=5)
    np.testing.assert_array_almost_equal(std_sale_price,
                                         83643.41726,
                                         decimal=5)
Example #20
# -*- coding: utf-8 -*-
"""
Created on Mon Apr  2 00:44:45 2018

@author: shifuddin
"""

from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('')
Example #21
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

#from keras.callbacks import ModelCheckpoint

from RecommendationModel import RecommendationModel
from DataPreprocessing import DataPreprocessing

print('script running.............')

dataPreprocessing = DataPreprocessing()

userIds, movieIds, ratings, genres, joined_dataset, movie_csv, rating_csv = dataPreprocessing.load_data(
)

print(len(genres))
rModel = RecommendationModel(dataPreprocessing.max_userId,
                             dataPreprocessing.max_movieId,
                             dataPreprocessing.k_factor)
model = rModel.generate_embeddedModel()

model.load_weights('weights_best_embedded.hdf5')


def predict_rating(userid, movieid):
    return model.predict([np.array([userid]), np.array([movieid])])[0][0]


userId = 50000
user_movies = [[userId, 1234.0, 4.0], [userId, 3421.0, 3.0],
Example #22
from BaselineRecommendations import BaselineRecommendations
from DataPreprocessing import DataPreprocessing
from CollaborativeFiltering import CollaborativeFiltering


if __name__ == '__main__':
    print "Welcome to the Anime Recommender System."

    #***************Data Preprocessing***************
    print "If this is your first time running the program? You'll need to create the necessary matrices and" \
          "remapped rating file if it is."
    initialization = raw_input("Create matrices and remapped rating file if they don't already exist? (yes or no) ")
    while initialization != 'yes' and initialization != 'no':
        initialization = raw_input("Create matrices and remapped rating file if they don't already exist? (yes or no) ")
    if initialization == 'yes':
        preprocess = DataPreprocessing()
        preprocess.run_random_split()
        preprocess.run_arbitrary_split()

    #***************Baseline Recommendation***************
    baseline = raw_input("Do you want to run the baseline recommendation? (yes or no) ")
    while baseline != 'yes' and baseline != 'no':
        baseline = raw_input("Do you want to run the baseline recommendation? (yes or no) ")
    if baseline == 'yes':
        sample_type = raw_input("Do you want to run the randomly sampled data, arbitrarily sampled data or both? (r, a, b) ")
        while sample_type != 'r' and sample_type != 'a' and sample_type != 'b':
            sample_type = raw_input("Do you want to run the randomly sampled data, arbitrarily sampled data or both? (r, a, b) ")
        if sample_type == 'r':
            print "Calculating RMSE for random dataset split."
            baseline_recommend = BaselineRecommendations('random')
            baseline_recommend.run_baseline()
Example #23
# -*- coding: utf-8 -*-
"""
Created on Sun Apr  1 13:06:19 2018

@author: shifuddin
"""

from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('Position_Salaries.csv')

features, outcome = preprocessing.read_data(1, 2, 2)

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(features, outcome)

predicted_salary = regressor.predict([[3]])
'''
Visualize regressor as graph
'''
import numpy as np
import matplotlib.pyplot as plt
X_grid = np.arange(min(features), max(features), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(features, outcome, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Decision Tree Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
Example #24
#!/usr/bin/env python
# coding: utf-8

# In[1]:

from DataPreprocessing import DataPreprocessing
from TextCleaner import TextCleaner
from tensorflow.keras.models import model_from_json
from attention import AttentionLayer
import numpy as np

# In[2]:

processor = DataPreprocessing()
cleaner = TextCleaner()

data = processor.load_pickle('DataSequences')

x_tr, x_test, x_dev, y_tr, y_test, y_dev = data[:6]

loaded_data = processor.load_pickle('TokenizerData')

(x_tokenizer, y_tokenizer, x_vocab_size, y_vocab_size, input_word_index,
 target_word_index, reversed_input_word_index, reversed_target_word_index,
 max_length_text, max_length_summary) = loaded_data[:10]

# In[3]:

Example #25
from flask import json, request, Blueprint, Flask
import os
import ast
import numpy as np
import pandas as pd
import random
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from RecommendationModel import RecommendationModel
from DataPreprocessing import DataPreprocessing

app = Flask(__name__)

#mod = Blueprint('API', __name__)

dataPreprocessing = DataPreprocessing()
movie_csv, link_csv, userIds_csv = dataPreprocessing.load_data()

rModel = RecommendationModel(dataPreprocessing.max_userId,
                             dataPreprocessing.max_movieId,
                             dataPreprocessing.k_factor)
model = rModel.generate_embeddedModel()

model.load_weights('weights_best_embedded.hdf5')

global graph
graph = tf.get_default_graph()


def predict_rating(model, userid, movieid):
    return model.predict([np.array([userid]), np.array([movieid])])[0][0]
Example #26
from DataVisualization import DataVisualization
from DataMining import DataMining

# Configure logger
log = Log.setup_logger("main")

log.info(Constants.INITIAL_MSG)

# Start calculating execution time
start_time = time.time()

log.info(Constants.START_MSG)

# Data Preprocessing Phase
log.info(Constants.DATA_PREPROCESSING_MSG)
dp = DataPreprocessing()
time_series = dp.preprocessing()

# Feature Engineering Phase
log.info(Constants.FEATURE_ENGINEERING_MSG)
fe = FeatureEngineering()

new_time_series = fe.execute_feature_engineering(time_series)

# new_time_series = dp.delete_column(time_series, ['Confirmed Cases', 'Deaths', 'Recovered Cases', 'Active Cases'])

# Truncate zero values from the time series
# new_time_series = dp.truncate_time_series(new_time_series, '26/02/2020')

# Preliminary Analysis: Stationarity Check
pa = PreliminaryAnalysis()
Example #27
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  7 13:30:29 2020

This file is used to test the Active Learning model.

@author: Donovan
"""

from ML_Class import Active_ML_Model
from DataPreprocessing import DataPreprocessing
from sklearn.ensemble import RandomForestClassifier
from SamplingMethods import lowestPercentage
import pandas as pd

preprocess = DataPreprocessing(True)
ml_classifier = RandomForestClassifier()
sampling_method = lowestPercentage

file_name = 'csvOut.csv'
data = pd.read_csv(file_name, index_col=0, header=None)
data = data.iloc[:, :-1]
corn_active_model = Active_ML_Model(data, ml_classifier, preprocess)

stop = False
while stop == False:
    corn_active_model.Continue(sampling_method)
    stop = True
Example #28
# -*- coding: utf-8 -*-
"""
Created on Mon Apr  2 00:44:45 2018

@author: shifuddin
"""

from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('Mall_Customers.csv')

features, outcome = preprocessing.read_data(2,4,4)

from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(features)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()


# Fitting K-Means to the dataset
kmeans = KMeans(n_clusters = 5, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(features)
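
A common follow-up to fit_predict is plotting the resulting clusters; a minimal sketch, assuming the features read above form two numeric columns (axis labels are kept generic because the exact columns returned by read_data are not shown):

import numpy as np

X = np.asarray(features)  # assumed: at least two numeric feature columns
for c in range(5):
    plt.scatter(X[y_kmeans == c, 0], X[y_kmeans == c, 1],
                label='Cluster {}'.format(c + 1))
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=200, marker='x', label='Centroids')
plt.title('K-Means clusters')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()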