def runXGBoostRanker():
    print("#### Running: RunMe.runXGBoostRanker() ####")
    reader = HomeDepotReader()

    feature_df = reader.getBasicDataFrame("../data/features_doc2vec_sense2vec_20170416.csv")
    feature_train_df = feature_df[:74067]
    feature_test_df = feature_df[74067:]
    feature_test_df.pop('relevance')

    soln_filename = '../data/solution.csv'
    soln_df = pd.read_csv(soln_filename, delimiter=',', low_memory=False, encoding="ISO-8859-1")
    dp = DataPreprocessing()
    test_private_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Private')
    test_public_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Public')

    xgb = XGBoostRanker(feature_train_df)
    xgb.train_Regressor(feature_train_df)
    # xgb.gridSearch_Regressor(feature_train_df)
    # result_df = xgb.test_Model(test_public_df)
    result_df = xgb.test_Model(test_private_df)
class FingerprintAuthentication:
    def __init__(self):
        datadir = '..{0}data{0}images'.format(os.sep)
        resnetmodel_path = '..{0}model{0}resnet_model.h5'.format(os.sep)
        siamesemodel_path = '..{0}model{0}siamese_model.h5'.format(os.sep)
        self.dataPreprocessing = DataPreprocessing(datadir, resnetmodel_path)
        self.fingerprintModel = FingerprintModel(siamesemodel_path, resnetmodel_path)
        self.keylist = list(self.dataPreprocessing.fingerprintDatabase.keys())

    def matchFingerprint(self, fingerprint, person_to_match=None):
        # No person id specified: match against the whole database
        if person_to_match is None:
            siamesefeature_one, siamesefeature_two = self.dataPreprocessing.createTestData(fingerprint)
        else:
            template_DB_keylist = []
            siamesefeature_one, siamesefeature_two = self.dataPreprocessing.createTestDataWithPersonID(
                fingerprint, person_to_match, template_DB_keylist)
        match_index, matched_prob = self.fingerprintModel.predict(siamesefeature_one, siamesefeature_two)
        DBkeylist = self.keylist if person_to_match is None else template_DB_keylist
        person_id = DBkeylist[match_index].split('aug')[0].split('_')[0]
        matched_tif = DBkeylist[match_index]
        matched_person = self.dataPreprocessing.person_guid_map[person_id]
        return matched_tif, matched_prob, matched_person
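# A minimal usage sketch of the class above (the fingerprint image name and
# the person id are illustrative placeholders, not from the original code):
if __name__ == '__main__':
    auth = FingerprintAuthentication()
    # Match against the whole database...
    tif, prob, person = auth.matchFingerprint('query_fingerprint.tif')
    print(tif, prob, person)
    # ...or verify against a single enrolled person's templates.
    tif, prob, person = auth.matchFingerprint('query_fingerprint.tif', person_to_match='101')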
def train(self, trainDF, validateDF):
    print("+++++++++++++++++++++Training model...")
    # Alternative label generation:
    # dp = DataPreprocessing()
    # trainDF, validateDF = dp.transformLabels(trainDF=trainDF, validationDF=validateDF,
    #                                          newColName=self.yColDiscrete)

    print("Remove non trainable features...")
    self.xTrain = trainDF
    self.yTrain = trainDF[self.yColDiscrete]
    if 'relevance_int' in self.xTrain:
        self.xTrain = self.xTrain.drop('relevance_int', axis=1)
    self.xTrain = self.xTrain.replace('inf', 99999)
    self.xTrain = self.xTrain.drop([
        'id', 'search_term', 'product_uid', 'relevance', 'product_idx',
        'Word2VecQueryExpansion', 'len_search_term', 'len_product_title'
    ], axis=1)

    print("+++++++++++++++++++++Training in progress")
    self.fittedModel = self.model.fit(self.xTrain, self.yTrain)
    self.yPred = self.fittedModel.predict(self.xTrain)

    print("Converting to old labels")
    dp = DataPreprocessing()
    self.yTrain = dp.transformNewLabelToOld(self.yTrain.as_matrix())
    self.yPred = dp.transformNewLabelToOld(self.yPred)
    print("MSE:", mean_squared_error(self.yTrain, self.yPred))
    print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
    print("+++++++++++++++++++++Training completed")
def __convertToCorpus(self, documents):
    """
    Make the documents compatible with gensim.

    Changelog
    - 15/3 KS First commit

    :param documents:
    :return:
    """
    # Preprocess the text
    dp = DataPreprocessing()
    text = dp.getBagOfWords(documentDF=documents, return_type='document_tokens')

    # Create a gensim text dictionary based on the documents
    print("Creating a text dictionary")
    self.dictionary = Dictionary(line.lower().split() for line in documents)
    print(self.dictionary)
    print("Saving text dictionary to file")
    self.dictionary.save('../data.prune/producttext.dict')

    # Create a gensim document corpus based on the dictionary and each document
    print("Creating a Gensim document corpus")
    self.corpus = [self.dictionary.doc2bow(line) for line in text]
    print("Saving corpus to file")
    MmCorpus.serialize('../data.prune/productcorpus.mm', self.corpus)
    self.corpus = MmCorpus('../data.prune/productcorpus.mm')
    print(self.corpus)
def validate(self, testDF, savePredictedFilename='../data/defaultPredictSave.csv'):
    print("+++++++++++++++++++++Validation start")
    print("Remove non trainable features...")
    savePredictedFilename = savePredictedFilename.split("csv")
    savePrediction = testDF['id'].as_matrix()
    print("Saveprediction=", savePrediction)

    self.xTest = testDF
    self.yTest = testDF[self.yColDiscrete]
    if 'relevance_int' in self.xTest:
        self.xTest = self.xTest.drop('relevance_int', axis=1)
    predictedDF = self.xTest
    self.xTest = self.xTest.replace('inf', 99999)
    self.xTest = self.xTest.drop('id', axis=1)
    self.xTest = self.xTest.drop([
        'search_term', 'product_uid', 'relevance', 'product_idx',
        'Word2VecQueryExpansion', 'len_search_term', 'len_product_title'
    ], axis=1)

    self.yPred = self.fittedModel.predict(self.xTest)
    predictedDF['relevance_int'] = self.yPred
    avgNDCG = NDCG_Eval().computeAvgNDCG(testDF, predictedDF,
                                         savePredictedFilename[0] + "nDCG" + ".csv")

    print("Converting to old labels")
    dp = DataPreprocessing()
    self.yTest = dp.transformNewLabelToOld(self.yTest)
    self.yPred = dp.transformNewLabelToOld(self.yPred)

    savePrediction = pd.DataFrame(savePrediction, columns=['id'])
    ypredDF = pd.DataFrame(self.yPred, columns=['pred_relevance'])
    ypredDF = ypredDF.reset_index(drop=True)  # fixed: reset_index is not in-place
    print("savePrediction.shape:", savePrediction.shape)
    print("ypredDF.size:", ypredDF.size)
    assert savePrediction.size == ypredDF.size
    predictionResults = savePrediction.join(ypredDF)
    predictionResults.to_csv(savePredictedFilename[0] + "csv", index=False)
    print("predictionResults.shape:", predictionResults.shape)

    print("MSE:", mean_squared_error(self.yTest, self.yPred))
    print("RMSE:", sqrt(mean_squared_error(self.yTest, self.yPred)))
    writeResults = (str(savePredictedFilename[0]) + "RMSE_NDCG_" + ".csv" +
                    "\nRMSE:" + str(sqrt(mean_squared_error(self.yTest, self.yPred))) +
                    "\nNDCG:" + str(avgNDCG))
    with open(savePredictedFilename[0] + "RMSE_NDCG_" + ".csv", 'w') as file:
        file.write(writeResults)
    print("+++++++++++++++++++++Validation end")
def flask_button_click():
    # Get the text from the Flask text box (same pattern as the form route below)
    text = request.form.get("text_in")
    proc = DataPreprocessing()
    load_data = proc.load_pickle("TokenizerData")
    predictor = Prediction(load_data)
    encoder_model = predictor.load_model('models\\encoder_model.json',
                                         'models\\encoder_model_weights.h5')
    decoder_model = predictor.load_model('models\\decoder_model.json',
                                         'models\\decoder_model_weights.h5')
    summary = predictor.generated_summaries(text, encoder_model, decoder_model)
    return summary
def oldLabelScorer(y, y_pred):
    # Custom scoring: convert back to the original labels, then score on RMSE
    dp = DataPreprocessing()
    y = dp.transformNewLabelToOld(y.as_matrix())
    y_pred = dp.transformNewLabelToOld(y_pred)
    rmse = sqrt(mean_squared_error(y, y_pred))
    # Negated so that greater is better, as scikit-learn scorers expect
    return -rmse
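# A minimal sketch of wiring the scorer into scikit-learn model selection
# (an assumption, not shown in the original; `estimator`, `X` and `y` are
# illustrative placeholders, and y must be a pandas Series for .as_matrix()):
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

old_label_scorer = make_scorer(oldLabelScorer)  # already negated, so greater is better
# scores = cross_val_score(estimator, X, y, scoring=old_label_scorer, cv=5)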
def input():
    # Get text from the user
    if request.method == 'POST':
        text = request.form.get("text_in")

        # Create the summary
        proc = DataPreprocessing()
        load_data = proc.load_pickle("TokenizerData")
        predictor = Prediction(load_data)
        encoder_model = predictor.load_model('models/encoder_model.json',
                                             'models/encoder_model_weights.h5')
        decoder_model = predictor.load_model('models/decoder_model.json',
                                             'models/decoder_model_weights.h5')
        summary = predictor.generated_summaries(text, encoder_model, decoder_model)
        return render_template("summary.html", summary=summary)
def main():
    # Split the data
    training, validation, test = DataPreprocessing().split_data()
    print(type(training))
    df = DataPreprocessing().prepare_data_for_statistics("train")

    # Apply word-based features
    WordBasedFeatures(df).features()

    # Apply syntactic features
    SyntacticFeatures(df).features()
    SyntacticFeatures(df).outputter()

    # Train the model
    model = Classifier(df)
    print(model.predict(df))
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 13:23:12 2018

@author: shifuddin
"""
from sklearn.neural_network import MLPRegressor
from DataPreprocessing import DataPreprocessing

# Create object of preprocessing class
preprocessing = DataPreprocessing('Position_Salaries.csv')

# Read features and outcome from the csv file
X, Y = preprocessing.read_csv(1, 2, 2)

# Scale features and outcome
X, Y = preprocessing.scale_data(X, Y)

# Create regressor from MLPRegressor
regressor = MLPRegressor(hidden_layer_sizes=(100, 50))
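# A plausible continuation of the truncated script above, mirroring the SVR
# example later in this collection (assumes scale_features/reverse_outcome
# behave as they do there):
import numpy as np

regressor.fit(X, Y.ravel())
predicted_scaled = regressor.predict(preprocessing.scale_features(np.array([[6.5]])))
predicted_salary = preprocessing.reverse_outcome(predicted_scaled)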
def __createAttributeColumn(self, product_df, attribute_df):
    dp = DataPreprocessing()
    attribute_doc_df = dp.getAttributeDoc(attribute_df)
    return product_df.join(attribute_doc_df.set_index('product_uid'), on='product_uid')
def __init__(self, model='ranf', _type="reg"):
    super().__init__(model)  # fixed: super() already binds self
    self.preprocessing = DataPreprocessing()
    # self.frameData = Agg_Frame_Data()
    self.model_type = model
    self.type = _type
    self.parameters = []
    self.all_models = []
    self.output_length = None
    self.output_columns = None
    self.feature_columns = None

    if self.model_type == 'ranf':
        self.model = RandomForestRegressor(warm_start=True, verbose=1, random_state=123)
        rf = {'bootstrap': False, 'min_samples_leaf': 2}  # shared settings
        Unaided_Branding_params = dict(rf, max_depth=11, max_features=0.25, n_estimators=100)
        Brand_mean_cues_params = dict(rf, max_depth=15, max_features=0.25, n_estimators=250)
        Aided_Branding__Mean_params = dict(rf, max_depth=15, max_features=0.3, n_estimators=100)
        Active_Involvement__Mean_params = dict(rf, max_depth=14, max_features=0.25, n_estimators=300)
        New_Information__Mean_params = dict(rf, max_depth=13, max_features=0.4, n_estimators=200)
        Enjoyment__Mean_params = dict(rf, max_depth=12, max_features=0.3, n_estimators=300)
        Brand_Appeal__Mean_params = dict(rf, max_depth=12, max_features=0.3, n_estimators=300)
        Understanding__Mean_params = dict(rf, max_depth=15, max_features=0.3, n_estimators=150)
        Relevance_of_Information__Mean_params = dict(rf, max_depth=12, max_features=0.25, n_estimators=150)
        Credibility_of_Information__Mean_params = dict(rf, max_depth=11, max_features=0.25, n_estimators=100)
        Brand_Difference__Mean_params = dict(rf, max_depth=13, max_features=0.25, n_estimators=400)
        Interest_peak_params = dict(rf, max_depth=13, max_features=0.3, n_estimators=200)
        Interest_mean_params = dict(rf, max_depth=15, max_features=0.3, n_estimators=200)
        Purchase_intent_params = dict(rf, max_depth=15, max_features=0.3, n_estimators=400)
        Persuasion_mean_params = dict(rf, max_depth=13, max_features=0.3, n_estimators=200)
        Persuasion_likely_params = dict(rf, max_depth=20, max_features=0.3, n_estimators=400)
        Interest_frames_params = dict(rf, max_depth=20, max_features=0.3, n_estimators=400)
    elif self.model_type == 'xgb':
        self.model = xgb.XGBRegressor(eta=0.3, save_period=1, random_state=123)
        # RMSE was decreasing with increasing n_estimators
        xg = {'objective': 'reg:linear', 'silent': False, 'alpha': 1,
              'learning_rate': 0.01, 'reg_lambda': 1}  # shared settings
        Unaided_Branding_params = dict(xg, colsample_bytree=0.4, max_depth=10, min_samples_leaf=4, n_estimators=2000)
        Brand_mean_cues_params = dict(xg, colsample_bytree=0.2, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Aided_Branding__Mean_params = dict(xg, colsample_bytree=0.2, max_depth=15, min_samples_leaf=4, n_estimators=1000)
        Active_Involvement__Mean_params = dict(xg, colsample_bytree=0.6, max_depth=10, min_samples_leaf=2, n_estimators=900)
        New_Information__Mean_params = dict(xg, colsample_bytree=0.4, max_depth=10, min_samples_leaf=2, n_estimators=1000, alpha=10)
        Enjoyment__Mean_params = dict(xg, colsample_bytree=0.8, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Brand_Appeal__Mean_params = dict(xg, colsample_bytree=0.6, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Understanding__Mean_params = dict(xg, colsample_bytree=0.6, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Relevance_of_Information__Mean_params = dict(xg, colsample_bytree=0.2, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Credibility_of_Information__Mean_params = dict(xg, colsample_bytree=0.3, max_depth=10, min_samples_leaf=2, n_estimators=1000)
        Brand_Difference__Mean_params = dict(xg, colsample_bytree=0.2, max_depth=8, min_samples_leaf=4, n_estimators=1000)
        Interest_peak_params = dict(xg, colsample_bytree=0.2, max_depth=10, min_samples_leaf=4, n_estimators=1000)
        Interest_mean_params = dict(xg, colsample_bytree=0.2, max_depth=10, min_samples_leaf=4, n_estimators=1000)
        Purchase_intent_params = dict(xg, colsample_bytree=0.2, max_depth=10, min_samples_leaf=4, n_estimators=1000)
        Persuasion_mean_params = dict(xg, colsample_bytree=0.4, max_depth=10, min_samples_leaf=4, n_estimators=1000)
        Persuasion_likely_params = dict(xg, colsample_bytree=0.4, max_depth=10, min_samples_leaf=4, n_estimators=1000)
        Interest_frames_params = dict(xg, colsample_bytree=0.4, max_depth=10, min_samples_leaf=4, n_estimators=1000)
    elif self.model_type == 'DT':
        self.model = DecisionTreeRegressor(random_state=123)
        dt = {'min_samples_leaf': 2, 'presort': True}  # shared settings
        Unaided_Branding_params = dict(dt, splitter='best', max_depth=20, max_features=0.6)
        Brand_mean_cues_params = dict(dt, splitter='best', max_depth=20, max_features=0.6)
        Aided_Branding__Mean_params = dict(dt, splitter='best', max_depth=20, max_features=0.2)
        Active_Involvement__Mean_params = dict(dt, max_depth=20, max_features=0.3)
        New_Information__Mean_params = dict(dt, max_depth=20, max_features=0.2)
        Enjoyment__Mean_params = dict(dt, max_depth=20, max_features=0.3)
        Brand_Appeal__Mean_params = dict(dt, max_depth=20, max_features=0.3)
        Understanding__Mean_params = dict(dt, max_depth=30, max_features=0.3)
        Relevance_of_Information__Mean_params = dict(dt, max_depth=30, max_features=0.3)
        Credibility_of_Information__Mean_params = dict(dt, max_depth=30, max_features=0.2)
        Brand_Difference__Mean_params = dict(dt, max_depth=30, max_features=0.2)
        Interest_peak_params = dict(dt, max_depth=30, max_features=0.2)
        Interest_mean_params = dict(dt, max_depth=30, max_features=0.2)
        Purchase_intent_params = dict(dt, max_depth=30, max_features=0.2)
        Persuasion_mean_params = dict(dt, max_depth=30, max_features=0.2)
        Persuasion_likely_params = dict(dt, max_depth=30, max_features=0.2)
        Interest_frames_params = dict(dt, max_depth=30, max_features=0.2)

    self.training_params = {
        "Unaided_Branding": Unaided_Branding_params,
        "Brand_Cues__Mean": Brand_mean_cues_params,
        "Aided_Branding__Mean": Aided_Branding__Mean_params,
        "Active_Involvement__Mean": Active_Involvement__Mean_params,
        "New_Information__Mean": New_Information__Mean_params,
        "Enjoyment__Mean": Enjoyment__Mean_params,
        "Brand_Appeal__Mean": Brand_Appeal__Mean_params,
        "Understanding__Mean": Understanding__Mean_params,
        "Relevance_of_Information__Mean": Relevance_of_Information__Mean_params,
        "Credibility_of_Information__Mean": Credibility_of_Information__Mean_params,
        "Brand_Difference__Mean": Brand_Difference__Mean_params,
        "Interest_peak": Interest_peak_params,
        "Interest_mean_score": Interest_mean_params,
        "Purchase_intent": Purchase_intent_params,
        "Persuasion_mean": Persuasion_mean_params,
        "Persuasion_very_likely": Persuasion_likely_params,
        "Interest_peak_frames": Interest_frames_params,
    }
import argparse
import sys


def getOptions(args=sys.argv[1:]):
    parser = argparse.ArgumentParser(description="Parse command")
    parser.add_argument("-s", "--dbserver", help="DB server", required=True)
    parser.add_argument("-d", "--database", help="database id on DB", required=True)
    parser.add_argument("-u", "--username", help="user id", required=True)
    parser.add_argument("-p", "--password", help="user password", required=True)
    options = parser.parse_args(args)
    return options


if __name__ == '__main__':
    options = getOptions(sys.argv[1:])

    sensorDB = SensorDB()
    sensorDB.connect(options.dbserver, options.database, options.username, options.password)

    processing = DataPreprocessing(sensorDB)
    processing.run()

    model = ModelTraining()
    model.load()
    model.train()
    model.save()

    detector = MachineStatusDetector()
    detector.load()
    data = [1, 0, 0, 100.98, 84.19363636363636, 590.1159504132231, 24.29230228721072]
    detector.detect(data)
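# Example invocation (hypothetical script name and credentials):
#   python run_pipeline.py -s db.example.com -d sensordb -u admin -p secret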
# orModel.gridSearch(feature_train_df, None)
print("#### Completed: OrdinalRegression ordridge training ####")
utility.checkpointTimeTrack()

# Validation/Test set
print("#### OrdinalRegression ordridge validating public/private sets ####")
print("Loading solution")
soln_filename = '../data/solution.csv'
soln_df = pd.read_csv(soln_filename, delimiter=',', low_memory=True, encoding="ISO-8859-1")
print("Completed loading solution")

dp = DataPreprocessing()
test_private_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Private')
# optionally: savepath='../data/test_private_gold.csv'
test_public_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Public')
# optionally: savepath='../data/test_public_gold.csv'
test_private_df = dp.transformLabels(test_private_df)
test_public_df = dp.transformLabels(test_public_df)
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 19:36:21 2018

@author: shifuddin
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR

from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('Position_Salaries.csv')
features, outcome = preprocessing.read_data(1, 2, 2)
features_scaled, outcome_scaled = preprocessing.scale_data(features, outcome)

regressor = SVR(kernel='rbf')
regressor.fit(features_scaled, outcome_scaled)

predicted_salary_scaled = regressor.predict(preprocessing.scale_features(np.array([[6.5]])))
predicted_salary = preprocessing.reverse_outcome(predicted_salary_scaled)

# Visualising the SVR results
plt.scatter(features_scaled, outcome_scaled, color='red')
plt.plot(features_scaled, regressor.predict(features_scaled), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
"../data/features_final_20170419.csv") columnname = feature_df.columns feature_train_df = feature_df feature_train_df = feature_df[:74067] feature_test_df = feature_df[74067:] feature_test_df.pop('relevance') soln_filename = '../data/solution.csv' soln_df = pd.read_csv(soln_filename, delimiter=',', low_memory=False, encoding="ISO-8859-1") dp = DataPreprocessing() test_private_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Private') test_public_df = dp.getGoldTestSet(feature_test_df, soln_df, testsetoption='Public') print("#### Running: XGBoostRanker.runXGBoostRanker() ####") xgb = XGBoostRanker(feature_train_df) xgb.train_Regressor(feature_train_df) result_public_df = xgb.test_Model(test_public_df, "Public") result_private_df = xgb.test_Model(test_private_df, "Private") # gold_df = pd.DataFrame()
# In[9]:

from Prediction import Prediction
from TextCleaner import TextCleaner
from DataPreprocessing import DataPreprocessing

# In[19]:

from keras.preprocessing.sequence import pad_sequences

# In[11]:

predictor = Prediction()
cleaner = TextCleaner()
processor = DataPreprocessing()

# In[12]:

loaded_data = processor.load_pickle('TokenizerData')
(x_tokenizer, y_tokenizer, x_vocab_size, y_vocab_size,
 input_word_index, target_word_index,
 reversed_input_word_index, reversed_target_word_index,
 max_length_text, max_length_summary) = loaded_data[:10]

# In[3]:

# Load trained model
encoder_model = predictor.load_model('encoder_model.json', 'encoder_model_weights.h5')
def test_DataPreprocessing():
    """Test the limit case of the class DataPreprocessing when all the
    parameters are given as input."""
    db_path = "./house-prices/house-prices.csv"

    # Nominal parameters:
    nominal = [
        "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour",
        "Lot Config", "Neighborhood", "Condition 1", "Condition 2",
        "Bldg Type", "House Style", "Roof Style", "Roof Matl",
        "Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Foundation",
        "Heating", "Central Air", "Garage Type", "Misc Feature",
        "Sale Type", "Sale Condition"
    ]
    # Continuous parameters:
    continuous = [
        "Lot Frontage", "Lot Area", "Mas Vnr Area", "BsmtFin SF 1",
        "BsmtFin SF 2", "Bsmt Unf SF", "Total Bsmt SF", "1st Flr SF",
        "2nd Flr SF", "Low Qual Fin SF", "Gr Liv Area", "Garage Area",
        "Wood Deck SF", "Open Porch SF", "Enclosed Porch", "3Ssn Porch",
        "Screen Porch", "Pool Area", "Misc Val"
    ]
    # Ordinal parameters:
    ordinal = [
        "Lot Shape", "Utilities", "Land Slope", "Overall Qual",
        "Overall Cond", "Exter Qual", "Exter Cond", "Bsmt Qual",
        "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2",
        "Heating QC", "Electrical", "Kitchen Qual", "Functional",
        "Fireplace Qu", "Garage Finish", "Garage Qual", "Garage Cond",
        "Paved Drive", "Pool QC", "Fence"
    ]
    # Discrete parameters:
    discrete = [
        "Year Built", "Year Remod/Add", "Bsmt Full Bath", "Bsmt Half Bath",
        "Full Bath", "Half Bath", "Bedroom AbvGr", "Kitchen AbvGr",
        "TotRms AbvGrd", "Fireplaces", "Garage Yr Blt", "Garage Cars",
        "Mo Sold", "Yr Sold"
    ]
    protocol = [0.8, 0.1, 0.1]

    house_price_db = HousePricesDatabase(db_path, continuous, discrete,
                                         ordinal, nominal, protocol)
    train_set, cv_set, test_set = house_price_db()
    preprocessing = DataPreprocessing(train_set, cv_set, test_set)
    X, y, mean_sale_price, std_sale_price = preprocessing()

    # Check the correct number of samples in each set
    nose.tools.eq_(X[0].shape[0], 1492)
    nose.tools.eq_(X[1].shape[0], 186)
    nose.tools.eq_(X[2].shape[0], 187)
    # Check the correct number of parameters per sample
    nose.tools.eq_(X[0].shape[1], 333)
    nose.tools.eq_(X[1].shape[1], 333)
    nose.tools.eq_(X[2].shape[1], 333)
    # Check the correct number of samples in each target set
    nose.tools.eq_(y[0].shape[0], 1492)
    nose.tools.eq_(y[1].shape[0], 186)
    nose.tools.eq_(y[2].shape[0], 187)
    # Verify that the same samples were used.
    np.testing.assert_array_almost_equal(np.sum(X[0]), 68632.00000, decimal=5)
    np.testing.assert_array_almost_equal(np.sum(X[1]), 8477.02924, decimal=5)
    np.testing.assert_array_almost_equal(np.sum(X[2]), 8372.82617, decimal=5)
    np.testing.assert_array_almost_equal(np.sum(y[0]), 1.421085e-13, decimal=18)
    np.testing.assert_array_almost_equal(np.sum(y[1]), -14.8419728, decimal=7)
    np.testing.assert_array_almost_equal(np.sum(y[2]), -23.7408063, decimal=7)
    # Check that z-normalization of the target used the correct mean and std
    np.testing.assert_array_almost_equal(mean_sale_price, 185637.85120, decimal=5)
    np.testing.assert_array_almost_equal(std_sale_price, 83643.41726, decimal=5)
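# To run this test with nose (the test module name is hypothetical):
#   nosetests test_data_preprocessing.py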
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 2 00:44:45 2018

@author: shifuddin
"""
from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('')
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
# from keras.callbacks import ModelCheckpoint

from RecommendationModel import RecommendationModel
from DataPreprocessing import DataPreprocessing

print('script running.............')

dataPreprocessing = DataPreprocessing()
userIds, movieIds, ratings, genres, joined_dataset, movie_csv, rating_csv = dataPreprocessing.load_data()
print(len(genres))

rModel = RecommendationModel(dataPreprocessing.max_userId,
                             dataPreprocessing.max_movieId,
                             dataPreprocessing.k_factor)
model = rModel.generate_embeddedModel()
model.load_weights('weights_best_embedded.hdf5')


def predict_rating(userid, movieid):
    # Fixed: use the userid argument (the original referenced a global userId)
    return model.predict([np.array([userid]), np.array([movieid])])[0][0]


userId = 50000
user_movies = [[userId, 1234.0, 4.0],
               [userId, 3421.0, 3.0],
from BaselineRecommendations import BaselineRecommendations
from DataPreprocessing import DataPreprocessing
from CollaborativeFiltering import CollaborativeFiltering

if __name__ == '__main__':
    print "Welcome to the Anime Recommender System."

    # *************** Data Preprocessing ***************
    print "Is this your first time running the program? If so, you'll need to create the necessary matrices and " \
          "remapped rating file."
    initialization = raw_input("Create matrices and remapped rating file if they don't already exist? (yes or no) ")
    while initialization != 'yes' and initialization != 'no':
        initialization = raw_input("Create matrices and remapped rating file if they don't already exist? (yes or no) ")
    if initialization == 'yes':
        preprocess = DataPreprocessing()
        preprocess.run_random_split()
        preprocess.run_arbitrary_split()

    # *************** Baseline Recommendation ***************
    baseline = raw_input("Do you want to run the baseline recommendation? (yes or no) ")
    while baseline != 'yes' and baseline != 'no':
        baseline = raw_input("Do you want to run the baseline recommendation? (yes or no) ")
    if baseline == 'yes':
        sample_type = raw_input("Do you want to run the randomly sampled data, arbitrarily sampled data or both? (r, a, b) ")
        while sample_type != 'r' and sample_type != 'a' and sample_type != 'b':
            sample_type = raw_input("Do you want to run the randomly sampled data, arbitrarily sampled data or both? (r, a, b) ")
        if sample_type == 'r':
            print "Calculating RMSE for random dataset split."
            baseline_recommend = BaselineRecommendations('random')
            baseline_recommend.run_baseline()
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 1 13:06:19 2018

This file is used to fit and visualize a decision-tree regressor.

@author: shifuddin
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('Position_Salaries.csv')
features, outcome = preprocessing.read_data(1, 2, 2)

regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(features, outcome)
predicted_salary = regressor.predict(np.array([[3]]))  # predict expects a 2-D array

# Visualize regressor as graph
X_grid = np.arange(min(features), max(features), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(features, outcome, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Decision Tree Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
#!/usr/bin/env python
# coding: utf-8

# In[1]:

from DataPreprocessing import DataPreprocessing
from TextCleaner import TextCleaner
from tensorflow.keras.models import model_from_json
from attention import AttentionLayer
import numpy as np

# In[2]:

processor = DataPreprocessing()
cleaner = TextCleaner()

data = processor.load_pickle('DataSequences')
x_tr, x_test, x_dev, y_tr, y_test, y_dev = data[:6]

loaded_data = processor.load_pickle('TokenizerData')
(x_tokenizer, y_tokenizer, x_vocab_size, y_vocab_size,
 input_word_index, target_word_index,
 reversed_input_word_index, reversed_target_word_index,
 max_length_text, max_length_summary) = loaded_data[:10]

# In[3]:
from flask import json, request, Blueprint, Flask
import os
import ast
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer

from RecommendationModel import RecommendationModel
from DataPreprocessing import DataPreprocessing

app = Flask(__name__)
# mod = Blueprint('API', __name__)

dataPreprocessing = DataPreprocessing()
movie_csv, link_csv, userIds_csv = dataPreprocessing.load_data()

rModel = RecommendationModel(dataPreprocessing.max_userId,
                             dataPreprocessing.max_movieId,
                             dataPreprocessing.k_factor)
model = rModel.generate_embeddedModel()
model.load_weights('weights_best_embedded.hdf5')

global graph
graph = tf.get_default_graph()


def predict_rating(model, userid, movieid):
    return model.predict([np.array([userid]), np.array([movieid])])[0][0]
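# A sketch of how a route could call the helper under the TF1 graph context
# (the endpoint is hypothetical; the original routes are not shown):
@app.route('/predict/<int:userid>/<int:movieid>')
def predict(userid, movieid):
    with graph.as_default():
        rating = float(predict_rating(model, userid, movieid))
    return json.dumps({'rating': rating})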
from DataVisualization import DataVisualization
from DataMining import DataMining

# Configure logger
log = Log.setup_logger("main")
log.info(Constants.INITIAL_MSG)

# Start calculating execution time
start_time = time.time()
log.info(Constants.START_MSG)

# Data Preprocessing Phase
log.info(Constants.DATA_PREPROCESSING_MSG)
dp = DataPreprocessing()
time_series = dp.preprocessing()

# Feature Engineering Phase
log.info(Constants.FEATURE_ENGINEERING_MSG)
fe = FeatureEngineering()
new_time_series = fe.execute_feature_engineering(time_series)
# new_time_series = dp.delete_column(time_series, ['Confirmed Cases', 'Deaths', 'Recovered Cases', 'Active Cases'])

# Truncate zero values from the time series
# new_time_series = dp.truncate_time_series(new_time_series, '26/02/2020')

# Preliminary Analysis: Stationarity Check
pa = PreliminaryAnalysis()
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 7 13:30:29 2020

This file is used to test the Active Learning model.

@author: Donovan
"""
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from ML_Class import Active_ML_Model
from DataPreprocessing import DataPreprocessing
from SamplingMethods import lowestPercentage

preprocess = DataPreprocessing(True)
ml_classifier = RandomForestClassifier()
sampling_method = lowestPercentage

file_name = 'csvOut.csv'
data = pd.read_csv(file_name, index_col=0, header=None)
data = data.iloc[:, :-1]

corn_active_model = Active_ML_Model(data, ml_classifier, preprocess)

stop = False
while not stop:
    corn_active_model.Continue(sampling_method)
    stop = True
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 2 00:44:45 2018

@author: shifuddin
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from DataPreprocessing import DataPreprocessing

preprocessing = DataPreprocessing('Mall_Customers.csv')
features, outcome = preprocessing.read_data(2, 4, 4)

# Use the elbow method to choose the number of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(features)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Fitting K-Means to the dataset
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(features)
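# A possible continuation (illustrative only): plot the five clusters returned
# by fit_predict, assuming `features` is a two-column numpy array as implied
# by read_data(2, 4, 4):
colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster in range(5):
    plt.scatter(features[y_kmeans == cluster, 0],
                features[y_kmeans == cluster, 1],
                s=20, c=colors[cluster], label='Cluster {}'.format(cluster + 1))
plt.title('Clusters of customers')
plt.legend()
plt.show()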