def runXGBoostRanker():
    print("#### Running: RunMe.runXGBoostRanker() ####")
    reader = HomeDepotReader()

    feature_df = reader.getBasicDataFrame("../data/features_doc2vec_sense2vec_20170416.csv")
    feature_train_df = feature_df[:74067]
    feature_test_df = feature_df[74067:]
    feature_test_df.pop('relevance')

    soln_filename = '../data/solution.csv'
    soln_df = pd.read_csv(soln_filename, delimiter=',', low_memory=False, encoding="ISO-8859-1")
    dp = DataPreprocessing()
    test_private_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Private')
    test_public_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Public')

    xgb = XGBoostRanker(feature_train_df)
    xgb.train_Regressor(feature_train_df)
    # xgb.gridSearch_Regressor(feature_train_df)
    # result_df = xgb.test_Model(test_public_df)
    result_df = xgb.test_Model(test_private_df)
class FingerprintAuthentication:
    def __init__(self):
        datadir = '..{0}data{0}images'.format(os.sep)
        resnetmodel_path = '..{0}model{0}resnet_model.h5'.format(os.sep)
        siamesemodel_path = '..{0}model{0}siamese_model.h5'.format(os.sep)
        self.dataPreprocessing = DataPreprocessing(datadir, resnetmodel_path)
        self.fingerprintModel = FingerprintModel(siamesemodel_path, resnetmodel_path)
        self.keylist = list(self.dataPreprocessing.fingerprintDatabase.keys())

    def matchFingerprint(self, fingerprint, person_to_match=None):
        # No person id specified: match against the whole database
        if person_to_match is None:
            siamesefeature_one, siamesefeature_two = self.dataPreprocessing.createTestData(fingerprint)
        else:
            template_DB_keylist = []
            siamesefeature_one, siamesefeature_two = self.dataPreprocessing.createTestDataWithPersonID(
                fingerprint, person_to_match, template_DB_keylist)
        match_index, matched_prob = self.fingerprintModel.predict(siamesefeature_one, siamesefeature_two)
        DBkeylist = self.keylist if person_to_match is None else template_DB_keylist
        person_id = DBkeylist[match_index].split('aug')[0].split('_')[0]
        matched_tif = DBkeylist[match_index]
        matched_person = self.dataPreprocessing.person_guid_map[person_id]
        return matched_tif, matched_prob, matched_person
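# A minimal usage sketch of the class above (the fingerprint image name and
# the person id are illustrative placeholders, not from the original code):
if __name__ == '__main__':
    auth = FingerprintAuthentication()
    # Match against the whole database...
    tif, prob, person = auth.matchFingerprint('query_fingerprint.tif')
    print(tif, prob, person)
    # ...or verify against a single enrolled person's templates.
    tif, prob, person = auth.matchFingerprint('query_fingerprint.tif', person_to_match='101')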
def train(self, trainDF, validateDF):
    print("+++++++++++++++++++++Training model...")
    # Alternative label generation:
    # dp = DataPreprocessing()
    # trainDF, validateDF = dp.transformLabels(trainDF=trainDF, validationDF=validateDF,
    #                                          newColName=self.yColDiscrete)

    print("Remove non trainable features...")
    self.xTrain = trainDF
    self.yTrain = trainDF[self.yColDiscrete]
    if 'relevance_int' in self.xTrain:
        self.xTrain = self.xTrain.drop('relevance_int', axis=1)
    self.xTrain = self.xTrain.replace('inf', 99999)
    self.xTrain = self.xTrain.drop([
        'id', 'search_term', 'product_uid', 'relevance', 'product_idx',
        'Word2VecQueryExpansion', 'len_search_term', 'len_product_title'
    ], axis=1)

    print("+++++++++++++++++++++Training in progress")
    self.fittedModel = self.model.fit(self.xTrain, self.yTrain)
    self.yPred = self.fittedModel.predict(self.xTrain)

    print("Converting to old labels")
    dp = DataPreprocessing()
    self.yTrain = dp.transformNewLabelToOld(self.yTrain.as_matrix())
    self.yPred = dp.transformNewLabelToOld(self.yPred)
    print("MSE:", mean_squared_error(self.yTrain, self.yPred))
    print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
    print("+++++++++++++++++++++Training completed")
def __convertToCorpus(self, documents):
    """
    Make the documents compatible with gensim.

    Changelog
    - 15/3 KS First commit

    :param documents:
    :return:
    """
    # Preprocess the text
    dp = DataPreprocessing()
    text = dp.getBagOfWords(documentDF=documents, return_type='document_tokens')

    # Create a gensim text dictionary based on the documents
    print("Creating a text dictionary")
    self.dictionary = Dictionary(line.lower().split() for line in documents)
    print(self.dictionary)
    print("Saving text dictionary to file")
    self.dictionary.save('../data.prune/producttext.dict')

    # Create a gensim document corpus based on the dictionary and each document
    print("Creating a Gensim document corpus")
    self.corpus = [self.dictionary.doc2bow(line) for line in text]
    print("Saving corpus to file")
    MmCorpus.serialize('../data.prune/productcorpus.mm', self.corpus)
    self.corpus = MmCorpus('../data.prune/productcorpus.mm')
    print(self.corpus)
def validate(self, testDF, savePredictedFilename='../data/defaultPredictSave.csv'):
    print("+++++++++++++++++++++Validation start")
    print("Remove non trainable features...")
    savePredictedFilename = savePredictedFilename.split("csv")
    savePrediction = testDF['id'].as_matrix()
    print("Saveprediction=", savePrediction)

    self.xTest = testDF
    self.yTest = testDF[self.yColDiscrete]
    if 'relevance_int' in self.xTest:
        self.xTest = self.xTest.drop('relevance_int', axis=1)
    predictedDF = self.xTest
    self.xTest = self.xTest.replace('inf', 99999)
    self.xTest = self.xTest.drop('id', axis=1)
    self.xTest = self.xTest.drop([
        'search_term', 'product_uid', 'relevance', 'product_idx',
        'Word2VecQueryExpansion', 'len_search_term', 'len_product_title'
    ], axis=1)

    self.yPred = self.fittedModel.predict(self.xTest)
    predictedDF['relevance_int'] = self.yPred
    avgNDCG = NDCG_Eval().computeAvgNDCG(testDF, predictedDF,
                                         savePredictedFilename[0] + "nDCG" + ".csv")

    print("Converting to old labels")
    dp = DataPreprocessing()
    self.yTest = dp.transformNewLabelToOld(self.yTest)
    self.yPred = dp.transformNewLabelToOld(self.yPred)

    savePrediction = pd.DataFrame(savePrediction, columns=['id'])
    ypredDF = pd.DataFrame(self.yPred, columns=['pred_relevance'])
    ypredDF = ypredDF.reset_index(drop=True)  # fixed: reset_index is not in-place
    print("savePrediction.shape:", savePrediction.shape)
    print("ypredDF.size:", ypredDF.size)
    assert savePrediction.size == ypredDF.size
    predictionResults = savePrediction.join(ypredDF)
    predictionResults.to_csv(savePredictedFilename[0] + "csv", index=False)
    print("predictionResults.shape:", predictionResults.shape)

    print("MSE:", mean_squared_error(self.yTest, self.yPred))
    print("RMSE:", sqrt(mean_squared_error(self.yTest, self.yPred)))
    writeResults = (str(savePredictedFilename[0]) + "RMSE_NDCG_" + ".csv" +
                    "\nRMSE:" + str(sqrt(mean_squared_error(self.yTest, self.yPred))) +
                    "\nNDCG:" + str(avgNDCG))
    with open(savePredictedFilename[0] + "RMSE_NDCG_" + ".csv", 'w') as file:
        file.write(writeResults)
    print("+++++++++++++++++++++Validation end")
def flask_button_click():
    # Get the text from the Flask text box (same pattern as the form route below)
    text = request.form.get("text_in")
    proc = DataPreprocessing()
    load_data = proc.load_pickle("TokenizerData")
    predictor = Prediction(load_data)
    encoder_model = predictor.load_model('models\\encoder_model.json',
                                         'models\\encoder_model_weights.h5')
    decoder_model = predictor.load_model('models\\decoder_model.json',
                                         'models\\decoder_model_weights.h5')
    summary = predictor.generated_summaries(text, encoder_model, decoder_model)
    return summary
def oldLabelScorer(y, y_pred):
    # Custom scoring: convert back to the original labels, then score on RMSE
    dp = DataPreprocessing()
    y = dp.transformNewLabelToOld(y.as_matrix())
    y_pred = dp.transformNewLabelToOld(y_pred)
    rmse = sqrt(mean_squared_error(y, y_pred))
    # Negated so that greater is better, as scikit-learn scorers expect
    return -rmse
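# A minimal sketch of wiring the scorer into scikit-learn model selection
# (an assumption, not shown in the original; `estimator`, `X` and `y` are
# illustrative placeholders, and y must be a pandas Series for .as_matrix()):
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

old_label_scorer = make_scorer(oldLabelScorer)  # already negated, so greater is better
# scores = cross_val_score(estimator, X, y, scoring=old_label_scorer, cv=5)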
def input():
    # Get text from the user
    if request.method == 'POST':
        text = request.form.get("text_in")

        # Create the summary
        proc = DataPreprocessing()
        load_data = proc.load_pickle("TokenizerData")
        predictor = Prediction(load_data)
        encoder_model = predictor.load_model('models/encoder_model.json',
                                             'models/encoder_model_weights.h5')
        decoder_model = predictor.load_model('models/decoder_model.json',
                                             'models/decoder_model_weights.h5')
        summary = predictor.generated_summaries(text, encoder_model, decoder_model)
        return render_template("summary.html", summary=summary)
def main():
    # Split the data
    training, validation, test = DataPreprocessing().split_data()
    print(type(training))
    df = DataPreprocessing().prepare_data_for_statistics("train")

    # Apply word-based features
    WordBasedFeatures(df).features()

    # Apply syntactic features
    SyntacticFeatures(df).features()
    SyntacticFeatures(df).outputter()

    # Train the model
    model = Classifier(df)
    print(model.predict(df))
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 13:23:12 2018

@author: shifuddin
"""
from sklearn.neural_network import MLPRegressor
from DataPreprocessing import DataPreprocessing

# Create object of preprocessing class
preprocessing = DataPreprocessing('Position_Salaries.csv')

# Read features and outcome from the csv file
X, Y = preprocessing.read_csv(1, 2, 2)

# Scale features and outcome
X, Y = preprocessing.scale_data(X, Y)

# Create regressor from MLPRegressor
regressor = MLPRegressor(hidden_layer_sizes=(100, 50))
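# A plausible continuation of the truncated script above, mirroring the SVR
# example later in this collection (assumes scale_features/reverse_outcome
# behave as they do there):
import numpy as np

regressor.fit(X, Y.ravel())
predicted_scaled = regressor.predict(preprocessing.scale_features(np.array([[6.5]])))
predicted_salary = preprocessing.reverse_outcome(predicted_scaled)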
def __createAttributeColumn(self, product_df, attribute_df):
    dp = DataPreprocessing()
    attribute_doc_df = dp.getAttributeDoc(attribute_df)
    return product_df.join(attribute_doc_df.set_index('product_uid'), on='product_uid')
def __init__(self, model='ranf', _type="reg"):
    super().__init__(model)  # fixed: super() already binds self
    self.preprocessing = DataPreprocessing()
    # self.frameData = Agg_Frame_Data()
    self.model_type = model
    self.type = _type
    self.parameters = []
    self.all_models = []
    self.output_length = None
    self.output_columns = None
    self.feature_columns = None

    if self.model_type == 'ranf':
        self.model = RandomForestRegressor(warm_start=True, verbose=1, random_state=123)
        rf = {'bootstrap': False, 'min_samples_leaf': 2}  # shared settings
        Unaided_Branding_params = dict(rf, max_depth=11, max_features=0.25, n_estimators=100)
        Brand_mean_cues_params = dict(rf, max_depth=15, max_features=0.25, n_estimators=250)
        Aided_Branding__Mean_params = dict(rf, max_depth=15, max_features=0.3, n_estimators=100)
        Active_Involvement__Mean_params = dict(rf, max_depth=14, max_features=0.25, n_estimators=300)
        New_Information__Mean_params = dict(rf, max_depth=13, max_features=0.4, n_estimators=200)
        Enjoyment__Mean_params = dict(rf, max_depth=12, max_features=0.3, n_estimators=300)
        Brand_Appeal__Mean_params = dict(rf, max_depth=12, max_features=0.3, n_estimators=300)
        Understanding__Mean_params = dict(rf, max_depth=15, max_features=0.3, n_estimators=150)
        Relevance_of_Information__Mean_params = dict(rf, max_depth=12, max_features=0.25, n_estimators=150)
        Credibility_of_Information__Mean_params = dict(rf, max_depth=11, max_features=0.25, n_estimators=100)
        Brand_Difference__Mean_params = dict(rf, max_depth=13, max_features=0.25, n_estimators=400)
        Interest_peak_params = dict(rf, max_depth=13, max_features=0.3, n_estimators=200)
        Interest_mean_params = dict(rf, max_depth=15, max_features=0.3, n_estimators=200)
        Purchase_intent_params = dict(rf, max_depth=15, max_features=0.3, n_estimators=400)
        Persuasion_mean_params = dict(rf, max_depth=13, max_features=0.3, n_estimators=200)
        Persuasion_likely_params = dict(rf, max_depth=20, max_features=0.3, n_estimators=400)
        Interest_frames_params = dict(rf, max_depth=20, max_features=0.3, n_estimators=400)
    elif self.model_type == 'xgb':
        self.model = xgb.XGBRegressor(eta=0.3, save_period=1, random_state=123)
        # RMSE was decreasing with increasing n_estimators
        xg = {'objective': 'reg:linear', 'silent': False, 'alpha': 1,
              'learning_rate': 0.01, 'reg_lambda': 1}  # shared settings
        Unaided_Branding_params = dict(xg, colsample_bytree=0.4, max_depth=10, min_samples_leaf=4, n_estimators=2000)
        Brand_mean_cues_params = dict(xg, colsample_bytree=0.2, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Aided_Branding__Mean_params = dict(xg, colsample_bytree=0.2, max_depth=15, min_samples_leaf=4, n_estimators=1000)
        Active_Involvement__Mean_params = dict(xg, colsample_bytree=0.6, max_depth=10, min_samples_leaf=2, n_estimators=900)
        New_Information__Mean_params = dict(xg, colsample_bytree=0.4, max_depth=10, min_samples_leaf=2, n_estimators=1000, alpha=10)
        Enjoyment__Mean_params = dict(xg, colsample_bytree=0.8, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Brand_Appeal__Mean_params = dict(xg, colsample_bytree=0.6, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Understanding__Mean_params = dict(xg, colsample_bytree=0.6, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Relevance_of_Information__Mean_params = dict(xg, colsample_bytree=0.2, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Credibility_of_Information__Mean_params = dict(xg, colsample_bytree=0.3, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Brand_Difference__Mean_params = dict(xg, colsample_bytree=0.2, max_depth=8, min_samples_leaf=4, n_estimators=1000)
        Interest_peak_params = dict(xg, colsample_bytree=0.2, max_depth=10, min_samples_leaf=4, n_estimators=1000)
        Interest_mean_params = dict(xg, colsample_bytree=0.2, max_depth=10, min_samples_leaf=4, n_estimators=1000)
        Purchase_intent_params = dict(xg, colsample_bytree=0.2, max_depth=10, min_samples_leaf=4, n_estimators=1000)
        Persuasion_mean_params = dict(xg, colsample_bytree=0.4, max_depth=10, min_samples_leaf=4, n_estimators=1000)
        Persuasion_likely_params = dict(xg, colsample_bytree=0.4, max_depth=10, min_samples_leaf=4, n_estimators=1000)
        Interest_frames_params = dict(xg, colsample_bytree=0.4, max_depth=10, min_samples_leaf=4, n_estimators=1000)
    elif self.model_type == 'DT':
        self.model = DecisionTreeRegressor(random_state=123)
        dt = {'min_samples_leaf': 2, 'presort': True}  # shared settings
        Unaided_Branding_params = dict(dt, splitter='best', max_depth=20, max_features=0.6)
        Brand_mean_cues_params = dict(dt, splitter='best', max_depth=20, max_features=0.6)
        Aided_Branding__Mean_params = dict(dt, splitter='best', max_depth=20, max_features=0.2)
        Active_Involvement__Mean_params = dict(dt, max_depth=20, max_features=0.3)
        New_Information__Mean_params = dict(dt, max_depth=20, max_features=0.2)
        Enjoyment__Mean_params = dict(dt, max_depth=20, max_features=0.3)
        Brand_Appeal__Mean_params = dict(dt, max_depth=20, max_features=0.3)
        Understanding__Mean_params = dict(dt, max_depth=30, max_features=0.3)
        Relevance_of_Information__Mean_params = dict(dt, max_depth=30, max_features=0.3)
        Credibility_of_Information__Mean_params = dict(dt, max_depth=30, max_features=0.2)
        Brand_Difference__Mean_params = dict(dt, max_depth=30, max_features=0.2)
        Interest_peak_params = dict(dt, max_depth=30, max_features=0.2)
        Interest_mean_params = dict(dt, max_depth=30, max_features=0.2)
        Purchase_intent_params = dict(dt, max_depth=30, max_features=0.2)
        Persuasion_mean_params = dict(dt, max_depth=30, max_features=0.2)
        Persuasion_likely_params = dict(dt, max_depth=30, max_features=0.2)
        Interest_frames_params = dict(dt, max_depth=30, max_features=0.2)

    self.training_params = {
        "Unaided_Branding": Unaided_Branding_params,
        "Brand_Cues__Mean": Brand_mean_cues_params,
        "Aided_Branding__Mean": Aided_Branding__Mean_params,
        "Active_Involvement__Mean": Active_Involvement__Mean_params,
        "New_Information__Mean": New_Information__Mean_params,
        "Enjoyment__Mean": Enjoyment__Mean_params,
        "Brand_Appeal__Mean": Brand_Appeal__Mean_params,
        "Understanding__Mean": Understanding__Mean_params,
        "Relevance_of_Information__Mean": Relevance_of_Information__Mean_params,
        "Credibility_of_Information__Mean": Credibility_of_Information__Mean_params,
        "Brand_Difference__Mean": Brand_Difference__Mean_params,
        "Interest_peak": Interest_peak_params,
        "Interest_mean_score": Interest_mean_params,
        "Purchase_intent": Purchase_intent_params,
        "Persuasion_mean": Persuasion_mean_params,
        "Persuasion_very_likely": Persuasion_likely_params,
        "Interest_peak_frames": Interest_frames_params,
    }
import argparse
import sys


def getOptions(args=sys.argv[1:]):
    parser = argparse.ArgumentParser(description="Parse command")
    parser.add_argument("-s", "--dbserver", help="DB server", required=True)
    parser.add_argument("-d", "--database", help="database id on DB", required=True)
    parser.add_argument("-u", "--username", help="user id", required=True)
    parser.add_argument("-p", "--password", help="user password", required=True)
    options = parser.parse_args(args)
    return options


if __name__ == '__main__':
    options = getOptions(sys.argv[1:])

    sensorDB = SensorDB()
    sensorDB.connect(options.dbserver, options.database, options.username, options.password)

    processing = DataPreprocessing(sensorDB)
    processing.run()

    model = ModelTraining()
    model.load()
    model.train()
    model.save()

    detector = MachineStatusDetector()
    detector.load()
    data = [1, 0, 0, 100.98, 84.19363636363636, 590.1159504132231, 24.29230228721072]
    detector.detect(data)
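# Example invocation (hypothetical script name and credentials):
#   python run_pipeline.py -s db.example.com -d sensordb -u admin -p secret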
# orModel.gridSearch(feature_train_df, None)
print("#### Completed: OrdinalRegression ordridge training ####")
utility.checkpointTimeTrack()

# Validation/Test set
print("#### OrdinalRegression ordridge validating public/private sets ####")
print("Loading solution")
soln_filename = '../data/solution.csv'
soln_df = pd.read_csv(soln_filename, delimiter=',', low_memory=True, encoding="ISO-8859-1")
print("Completed loading solution")

dp = DataPreprocessing()
test_private_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Private')
# optionally: savepath='../data/test_private_gold.csv'
test_public_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Public')
# optionally: savepath='../data/test_public_gold.csv'
test_private_df = dp.transformLabels(test_private_df)
test_public_df = dp.transformLabels(test_public_df)
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 19:36:21 2018

@author: shifuddin
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR

from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('Position_Salaries.csv')
features, outcome = preprocessing.read_data(1, 2, 2)
features_scaled, outcome_scaled = preprocessing.scale_data(features, outcome)

regressor = SVR(kernel='rbf')
regressor.fit(features_scaled, outcome_scaled)

predicted_salary_scaled = regressor.predict(preprocessing.scale_features(np.array([[6.5]])))
predicted_salary = preprocessing.reverse_outcome(predicted_salary_scaled)

# Visualising the SVR results
plt.scatter(features_scaled, outcome_scaled, color='red')
plt.plot(features_scaled, regressor.predict(features_scaled), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
"../data/features_final_20170419.csv") columnname = feature_df.columns feature_train_df = feature_df feature_train_df = feature_df[:74067] feature_test_df = feature_df[74067:] feature_test_df.pop('relevance') soln_filename = '../data/solution.csv' soln_df = pd.read_csv(soln_filename, delimiter=',', low_memory=False, encoding="ISO-8859-1") dp = DataPreprocessing() test_private_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Private') test_public_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Public') print("#### Running: XGBoostRanker.runXGBoostRanker() ####") xgb = XGBoostRanker(feature_train_df) xgb.train_Regressor(feature_train_df) result_public_df = xgb.test_Model(test_public_df, "Public") result_private_df = xgb.test_Model(test_private_df, "Private") # gold_df = pd.DataFrame()
# In[9]:

from Prediction import Prediction
from TextCleaner import TextCleaner
from DataPreprocessing import DataPreprocessing

# In[19]:

from keras.preprocessing.sequence import pad_sequences

# In[11]:

predictor = Prediction()
cleaner = TextCleaner()
processor = DataPreprocessing()

# In[12]:

loaded_data = processor.load_pickle('TokenizerData')
(x_tokenizer, y_tokenizer, x_vocab_size, y_vocab_size,
 input_word_index, target_word_index,
 reversed_input_word_index, reversed_target_word_index,
 max_length_text, max_length_summary) = loaded_data[:10]

# In[3]:

# Load trained model
encoder_model = predictor.load_model('encoder_model.json', 'encoder_model_weights.h5')
def test_DataPreprocessing():
    """Test the limit case of the class DataPreprocessing when all the
    parameters are given as input."""
    db_path = "./house-prices/house-prices.csv"

    # Nominal parameters:
    nominal = [
        "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour",
        "Lot Config", "Neighborhood", "Condition 1", "Condition 2",
        "Bldg Type", "House Style", "Roof Style", "Roof Matl",
        "Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Foundation",
        "Heating", "Central Air", "Garage Type", "Misc Feature",
        "Sale Type", "Sale Condition"
    ]
    # Continuous parameters:
    continuous = [
        "Lot Frontage", "Lot Area", "Mas Vnr Area", "BsmtFin SF 1",
        "BsmtFin SF 2", "Bsmt Unf SF", "Total Bsmt SF", "1st Flr SF",
        "2nd Flr SF", "Low Qual Fin SF", "Gr Liv Area", "Garage Area",
        "Wood Deck SF", "Open Porch SF", "Enclosed Porch", "3Ssn Porch",
        "Screen Porch", "Pool Area", "Misc Val"
    ]
    # Ordinal parameters:
    ordinal = [
        "Lot Shape", "Utilities", "Land Slope", "Overall Qual",
        "Overall Cond", "Exter Qual", "Exter Cond", "Bsmt Qual",
        "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2",
        "Heating QC", "Electrical", "Kitchen Qual", "Functional",
        "Fireplace Qu", "Garage Finish", "Garage Qual", "Garage Cond",
        "Paved Drive", "Pool QC", "Fence"
    ]
    # Discrete parameters:
    discrete = [
        "Year Built", "Year Remod/Add", "Bsmt Full Bath", "Bsmt Half Bath",
        "Full Bath", "Half Bath", "Bedroom AbvGr", "Kitchen AbvGr",
        "TotRms AbvGrd", "Fireplaces", "Garage Yr Blt", "Garage Cars",
        "Mo Sold", "Yr Sold"
    ]
    protocol = [0.8, 0.1, 0.1]

    house_price_db = HousePricesDatabase(db_path, continuous, discrete,
                                         ordinal, nominal, protocol)
    train_set, cv_set, test_set = house_price_db()
    preprocessing = DataPreprocessing(train_set, cv_set, test_set)
    X, y, mean_sale_price, std_sale_price = preprocessing()

    # Check the correct number of samples in each set
    nose.tools.eq_(X[0].shape[0], 1492)
    nose.tools.eq_(X[1].shape[0], 186)
    nose.tools.eq_(X[2].shape[0], 187)
    # Check the correct number of parameters per sample
    nose.tools.eq_(X[0].shape[1], 333)
    nose.tools.eq_(X[1].shape[1], 333)
    nose.tools.eq_(X[2].shape[1], 333)
    # Check the correct number of samples in each target set
    nose.tools.eq_(y[0].shape[0], 1492)
    nose.tools.eq_(y[1].shape[0], 186)
    nose.tools.eq_(y[2].shape[0], 187)
    # Verify that the same samples were used.
    np.testing.assert_array_almost_equal(np.sum(X[0]), 68632.00000, decimal=5)
    np.testing.assert_array_almost_equal(np.sum(X[1]), 8477.02924, decimal=5)
    np.testing.assert_array_almost_equal(np.sum(X[2]), 8372.82617, decimal=5)
    np.testing.assert_array_almost_equal(np.sum(y[0]), 1.421085e-13, decimal=18)
    np.testing.assert_array_almost_equal(np.sum(y[1]), -14.8419728, decimal=7)
    np.testing.assert_array_almost_equal(np.sum(y[2]), -23.7408063, decimal=7)
    # Check that z-normalization of the target used the correct mean and std
    np.testing.assert_array_almost_equal(mean_sale_price, 185637.85120, decimal=5)
    np.testing.assert_array_almost_equal(std_sale_price, 83643.41726, decimal=5)
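# To run this test with nose (the test module name is hypothetical):
#   nosetests test_data_preprocessing.py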
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 2 00:44:45 2018

@author: shifuddin
"""
from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('')
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
# from keras.callbacks import ModelCheckpoint

from RecommendationModel import RecommendationModel
from DataPreprocessing import DataPreprocessing

print('script running.............')

dataPreprocessing = DataPreprocessing()
userIds, movieIds, ratings, genres, joined_dataset, movie_csv, rating_csv = dataPreprocessing.load_data()
print(len(genres))

rModel = RecommendationModel(dataPreprocessing.max_userId,
                             dataPreprocessing.max_movieId,
                             dataPreprocessing.k_factor)
model = rModel.generate_embeddedModel()
model.load_weights('weights_best_embedded.hdf5')


def predict_rating(userid, movieid):
    # Fixed: use the userid argument (the original referenced a global userId)
    return model.predict([np.array([userid]), np.array([movieid])])[0][0]


userId = 50000
user_movies = [[userId, 1234.0, 4.0],
               [userId, 3421.0, 3.0],
from BaselineRecommendations import BaselineRecommendations
from DataPreprocessing import DataPreprocessing
from CollaborativeFiltering import CollaborativeFiltering

if __name__ == '__main__':
    print "Welcome to the Anime Recommender System."

    # *************** Data Preprocessing ***************
    print "Is this your first time running the program? If so, you'll need to create the necessary matrices and " \
          "remapped rating file."
    initialization = raw_input("Create matrices and remapped rating file if they don't already exist? (yes or no) ")
    while initialization != 'yes' and initialization != 'no':
        initialization = raw_input("Create matrices and remapped rating file if they don't already exist? (yes or no) ")
    if initialization == 'yes':
        preprocess = DataPreprocessing()
        preprocess.run_random_split()
        preprocess.run_arbitrary_split()

    # *************** Baseline Recommendation ***************
    baseline = raw_input("Do you want to run the baseline recommendation? (yes or no) ")
    while baseline != 'yes' and baseline != 'no':
        baseline = raw_input("Do you want to run the baseline recommendation? (yes or no) ")
    if baseline == 'yes':
        sample_type = raw_input("Do you want to run the randomly sampled data, arbitrarily sampled data or both? (r, a, b) ")
        while sample_type != 'r' and sample_type != 'a' and sample_type != 'b':
            sample_type = raw_input("Do you want to run the randomly sampled data, arbitrarily sampled data or both? (r, a, b) ")
        if sample_type == 'r':
            print "Calculating RMSE for random dataset split."
            baseline_recommend = BaselineRecommendations('random')
            baseline_recommend.run_baseline()
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 1 13:06:19 2018

This file is used to fit and visualize a decision-tree regressor.

@author: shifuddin
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('Position_Salaries.csv')
features, outcome = preprocessing.read_data(1, 2, 2)

regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(features, outcome)
predicted_salary = regressor.predict(np.array([[3]]))  # predict expects a 2-D array

# Visualize regressor as graph
X_grid = np.arange(min(features), max(features), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(features, outcome, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Decision Tree Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
#!/usr/bin/env python
# coding: utf-8

# In[1]:

from DataPreprocessing import DataPreprocessing
from TextCleaner import TextCleaner
from tensorflow.keras.models import model_from_json
from attention import AttentionLayer
import numpy as np

# In[2]:

processor = DataPreprocessing()
cleaner = TextCleaner()

data = processor.load_pickle('DataSequences')
x_tr, x_test, x_dev, y_tr, y_test, y_dev = data[:6]

loaded_data = processor.load_pickle('TokenizerData')
(x_tokenizer, y_tokenizer, x_vocab_size, y_vocab_size,
 input_word_index, target_word_index,
 reversed_input_word_index, reversed_target_word_index,
 max_length_text, max_length_summary) = loaded_data[:10]

# In[3]:
from flask import json, request, Blueprint, Flask
import os
import ast
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer

from RecommendationModel import RecommendationModel
from DataPreprocessing import DataPreprocessing

app = Flask(__name__)
# mod = Blueprint('API', __name__)

dataPreprocessing = DataPreprocessing()
movie_csv, link_csv, userIds_csv = dataPreprocessing.load_data()

rModel = RecommendationModel(dataPreprocessing.max_userId,
                             dataPreprocessing.max_movieId,
                             dataPreprocessing.k_factor)
model = rModel.generate_embeddedModel()
model.load_weights('weights_best_embedded.hdf5')

global graph
graph = tf.get_default_graph()


def predict_rating(model, userid, movieid):
    return model.predict([np.array([userid]), np.array([movieid])])[0][0]
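# A sketch of how a route could call the helper under the TF1 graph context
# (the endpoint is hypothetical; the original routes are not shown):
@app.route('/predict/<int:userid>/<int:movieid>')
def predict(userid, movieid):
    with graph.as_default():
        rating = float(predict_rating(model, userid, movieid))
    return json.dumps({'rating': rating})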
from DataVisualization import DataVisualization
from DataMining import DataMining

# Configure logger
log = Log.setup_logger("main")
log.info(Constants.INITIAL_MSG)

# Start calculating execution time
start_time = time.time()
log.info(Constants.START_MSG)

# Data Preprocessing Phase
log.info(Constants.DATA_PREPROCESSING_MSG)
dp = DataPreprocessing()
time_series = dp.preprocessing()

# Feature Engineering Phase
log.info(Constants.FEATURE_ENGINEERING_MSG)
fe = FeatureEngineering()
new_time_series = fe.execute_feature_engineering(time_series)
# new_time_series = dp.delete_column(time_series, ['Confirmed Cases', 'Deaths', 'Recovered Cases', 'Active Cases'])

# Truncate zero values from the time series
# new_time_series = dp.truncate_time_series(new_time_series, '26/02/2020')

# Preliminary Analysis: Stationarity Check
pa = PreliminaryAnalysis()
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 7 13:30:29 2020

This file is used to test the Active Learning model.

@author: Donovan
"""
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from ML_Class import Active_ML_Model
from DataPreprocessing import DataPreprocessing
from SamplingMethods import lowestPercentage

preprocess = DataPreprocessing(True)
ml_classifier = RandomForestClassifier()
sampling_method = lowestPercentage

file_name = 'csvOut.csv'
data = pd.read_csv(file_name, index_col=0, header=None)
data = data.iloc[:, :-1]

corn_active_model = Active_ML_Model(data, ml_classifier, preprocess)

stop = False
while not stop:
    corn_active_model.Continue(sampling_method)
    stop = True
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 2 00:44:45 2018

@author: shifuddin
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('Mall_Customers.csv')
features, outcome = preprocessing.read_data(2, 4, 4)

# Use the elbow method to choose the number of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(features)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Fitting K-Means to the dataset
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(features)
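# A possible continuation (illustrative only): plot the five clusters returned
# by fit_predict, assuming `features` is a two-column numpy array as implied
# by read_data(2, 4, 4):
colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster in range(5):
    plt.scatter(features[y_kmeans == cluster, 0],
                features[y_kmeans == cluster, 1],
                s=20, c=colors[cluster], label='Cluster {}'.format(cluster + 1))
plt.title('Clusters of customers')
plt.legend()
plt.show()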