Example #1
0
def test_faces():
	grid_length = 4
	grid_width = 3

	train_prep = PreProcessor("./data/facedata/facedatatrain", "./data/facedata/facedatatrainlabels")
	add_partition_features(train_prep, grid_length, grid_width)
	reduced_train_prep = [x[-1 * grid_length * grid_width:] for x in train_prep.X]

	valid_prep = PreProcessor("./data/facedata/facedatavalidation", "./data/facedata/facedatavalidationlabels")
	add_partition_features(valid_prep, grid_length, grid_width)
	reduced_valid_prep = [x[-1 * grid_length * grid_width:] for x in valid_prep.X]
	kNN = KNearestNeighbors(dist_metric = "euclidean")

	"""
	# TUNING PROCEDURE #
	step_size = 1
	start_point = 1
	end_point = 31
	optimal_k, valid_accuracy = kNN.get_optimal_hparam(reduced_train_prep, train_prep.y, reduced_valid_prep, valid_prep.y, 
														start_point, end_point, step_size)
	print("Optimal k: {}\nAccuracy: {}".format(optimal_k, valid_accuracy)) # 6
	"""
	
	
	kNN.train(reduced_train_prep, train_prep.y, k = 6)
	# train_pred_list = kNN.predict(reduced_train_prep)
	# print("Training Accuracy: {}".format(kNN.compute_accuracy(kNN.y, train_pred_list)))
	valid_pred_list = kNN.predict(reduced_valid_prep)
	print("Validation Accuracy: {}".format(kNN.compute_accuracy(valid_prep.y, valid_pred_list)))
	
	
	"""
Example #2
0
class trainer():
	def __init__(self,state):
		self.state = state
		self.dataset_dir = self.state.get('dataset_dir','')
		self.list_dir = os.path.join(self.dataset_dir,'lists')
		self.lists = {}
		self.lists['train'] = os.path.join(self.list_dir,'train_1_of_1.txt')
		self.lists['valid'] = os.path.join(self.list_dir,'valid_1_of_1.txt')
		self.lists['test'] = os.path.join(self.list_dir,'test_1_of_1.txt')
		self.preprocessor = PreProcessor(self.dataset_dir) 
		print('Preparing train/valid/test splits')
		self.preprocessor.prepare_fold(self.lists['train'],self.lists['valid'],self.lists['test'])
		self.data = self.preprocessor.data
		self.targets = self.preprocessor.targets
		print('Building model.')
		self.model = MLP(n_inputs=self.state.get('n_inputs',513),n_outputs=self.state.get('n_outputs',10),
						 n_hidden=self.state.get('n_hidden',[50]),activation=self.state.get('activation','sigmoid'),
						 output_layer=self.state.get('output_layer','sigmoid'),dropout_rates=self.state.get('dropout_rates',None))

	def train(self):
		print('Starting training.')
		print('Initializing train dataset.')
		self.batch_size = self.state.get('batch_size',20)
		train_set = Dataset([self.data['train']],batch_size=self.batch_size,targets=[self.targets['train']])
		print('Initializing valid dataset.')
		valid_set = Dataset([self.data['valid']],batch_size=self.batch_size,targets=[self.targets['valid']])
		self.optimizer = SGD_Optimizer(self.model.params,[self.model.x,self.model.y],[self.model.cost,self.model.acc],momentum=self.state.get('momentum',False))
		lr = self.state.get('learning_rate',0.1)
		num_epochs = self.state.get('num_epochs',200)
		save = self.state.get('save',False)
		mom_rate = self.state.get('mom_rate',None)
		self.optimizer.train(train_set,valid_set,learning_rate=lr,num_epochs=num_epochs,save=save,mom_rate=mom_rate)
Example #3
0
def add_partition_features(prep, grid_length, grid_width):
	feat_dicts = [] # length of list is number of obs, each element is dictionary of agged feature
	for i in range(len(prep.raw_data)):
		mapped_image = prep.get_mapped_image(i)
		part_dict = PreProcessor.partition_image(mapped_image, grid_length = grid_length, grid_width = grid_width)
		feat_dicts.append(PreProcessor.agg_partitions(part_dict))
	key_list = list(feat_dicts[0].keys())
	for key in key_list:
		prep.add_feature(None, [feat_dicts[i][key] for i in range(len(feat_dicts))])
Example #4
0
    def test_推論時にget_datasetで辞書型にXが想定サンプル分ndarrayがあればTrue(self, do):
        from preprocessing import PreProcessor
        pp = PreProcessor(do.valid_config_path, mode='pred')
        dummy_df = do.dummy_valid_df
        dataset = pp.get_dataset(dummy_df)

        print('\n', dataset)

        errmsg = 'the number of samples in feature X does not match the number of samples in the input data'
        assert dataset['X'].shape[0] == dummy_df.values.shape[0], errmsg
Example #5
0
class trainer():
    def __init__(self, state):
        self.state = state
        self.dataset_dir = self.state.get('dataset_dir', '')
        self.list_dir = os.path.join(self.dataset_dir, 'lists')
        self.lists = {}
        self.lists['train'] = os.path.join(self.list_dir, 'train_1_of_1.txt')
        self.lists['valid'] = os.path.join(self.list_dir, 'valid_1_of_1.txt')
        self.lists['test'] = os.path.join(self.list_dir, 'test_1_of_1.txt')
        self.preprocessor = PreProcessor(self.dataset_dir)
        print('Preparing train/valid/test splits')
        self.preprocessor.prepare_fold(self.lists['train'],
                                       self.lists['valid'], self.lists['test'])
        self.data = self.preprocessor.data
        self.targets = self.preprocessor.targets
        print('Building model.')
        self.model = MLP(n_inputs=self.state.get('n_inputs', 513),
                         n_outputs=self.state.get('n_outputs', 10),
                         n_hidden=self.state.get('n_hidden', [50]),
                         activation=self.state.get('activation', 'sigmoid'),
                         output_layer=self.state.get('output_layer', 'sigmoid'),
                         dropout_rates=self.state.get('dropout_rates', None))

    def train(self):
        print('Starting training.')
        print('Initializing train dataset.')
        self.batch_size = self.state.get('batch_size', 20)
        train_set = Dataset([self.data['train']],
                            batch_size=self.batch_size,
                            targets=[self.targets['train']])
        print('Initializing valid dataset.')
        valid_set = Dataset([self.data['valid']],
                            batch_size=self.batch_size,
                            targets=[self.targets['valid']])
        self.optimizer = SGD_Optimizer(self.model.params,
                                       [self.model.x, self.model.y],
                                       [self.model.cost, self.model.acc],
                                       momentum=self.state.get(
                                           'momentum', False))
        lr = self.state.get('learning_rate', 0.1)
        num_epochs = self.state.get('num_epochs', 200)
        save = self.state.get('save', False)
        mom_rate = self.state.get('mom_rate', None)
        self.optimizer.train(train_set,
                             valid_set,
                             learning_rate=lr,
                             num_epochs=num_epochs,
                             save=save,
                             mom_rate=mom_rate)
Example #6
0
 def __init__(self, state, rand):
     self.state = state
     self.dataset_dir = self.state.get('dataset_dir', '')
     self.list_dir = os.path.join(self.dataset_dir, 'lists')
     self.lists = {}
     self.lists['train'] = os.path.join(self.list_dir, 'train_1_of_1.txt')
     self.lists['valid'] = os.path.join(self.list_dir, 'valid_1_of_1.txt')
     self.lists['test'] = os.path.join(self.list_dir, 'test_1_of_1.txt')
     self.preprocessor = PreProcessor(self.dataset_dir)
     print('Preparing train/valid/test splits')
     self.preprocessor.prepare_fold(
         self.lists['train'], self.lists['valid'], self.lists['test'])
     self.data = self.preprocessor.data
     self.targets = self.preprocessor.targets
     print('Building model.')
     if self.state.get('model', 'MLP') == 'MLP':
         self.model = MLP(rand, n_inputs=self.state.get('n_inputs', 513),
                          n_outputs=self.state.get('n_outputs', 10),
                          n_hidden=self.state.get('n_hidden', [50]),
                          activation=self.state.get('activation', 'sigmoid'),
                          output_layer=self.state.get('output_layer', 'sigmoid'),
                          dropout_rates=self.state.get('dropout_rates', None))
     elif self.state.get('model') == 'LR':
         self.model = LR(rand, n_inputs=self.state.get('n_inputs', 513),
                         n_outputs=self.state.get('n_outputs', 10),
                         activation=self.state.get('activation', 'sigmoid'),
                         output_layer=self.state.get('output_layer', 'sigmoid'))
Example #7
0
 def test_不正なmode引数ならinstance生成失敗でValueErrorを返す(self, do):
     from preprocessing import PreProcessor
     try:
         pp_invalid_mode = PreProcessor(do.valid_config_path,
                                        mode='invalid')
     except ValueError:
         pp_invalid_mode = None
     assert pp_invalid_mode is None
Example #8
0
def data_cleaning(data, type=TYPE):
    df = data.copy()
    if DESC_COL in df:
        df[DESC_COL] = df[DESC_COL].apply(
            lambda x: re.sub('[0-9]+', '', str(x)))
    print('class instantiation')
    if type == 'tickets':
        preprocessor = PreProcessor(email_cleaning=True,
                                    custom_cleaning=True,
                                    note_flag=False)
    else:
        print('include source and subject columns in data')
        df[SOURCE_COL] = 1
        df[SUBJECT_COL] = ''
        preprocessor = PreProcessor(email_cleaning=True,
                                    custom_cleaning=True,
                                    desc_col='body',
                                    note_flag=True)
    df1 = preprocessor._cleaning(df)
    return df1
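# A minimal, hypothetical usage sketch for data_cleaning (not part of the original
# snippet): tickets_df and notes_df are assumed pandas DataFrames containing the
# DESC_COL text column; TYPE, DESC_COL, SOURCE_COL and SUBJECT_COL are module-level
# constants that are not shown above.
cleaned_tickets = data_cleaning(tickets_df, type='tickets')  # ticket-specific PreProcessor
cleaned_notes = data_cleaning(notes_df, type='notes')        # adds source/subject columns first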
Example #9
0
 def __init__(self, state):
     self.state = state
     self.dataset_dir = self.state.get('dataset_dir', '')
     self.list_dir = os.path.join(self.dataset_dir, 'lists')
     self.lists = {}
     self.lists['train'] = os.path.join(self.list_dir, 'train_1_of_1.txt')
     self.lists['valid'] = os.path.join(self.list_dir, 'valid_1_of_1.txt')
     self.lists['test'] = os.path.join(self.list_dir, 'test_1_of_1.txt')
     self.preprocessor = PreProcessor(self.dataset_dir)
     print('Preparing train/valid/test splits')
     self.preprocessor.prepare_fold(self.lists['train'],
                                    self.lists['valid'], self.lists['test'])
     self.data = self.preprocessor.data
     self.targets = self.preprocessor.targets
     print('Building model.')
     self.model = MLP(n_inputs=self.state.get('n_inputs', 513),
                      n_outputs=self.state.get('n_outputs', 10),
                      n_hidden=self.state.get('n_hidden', [50]),
                      activation=self.state.get('activation', 'sigmoid'),
                      output_layer=self.state.get('output_layer', 'sigmoid'),
                      dropout_rates=self.state.get('dropout_rates', None))
Example #10
0
    def process_signal(self):
        self.counter += 1
        self.output_buffer = np.zeros([self.input_buffer.shape[0], self.feature_vector_size])
        threads = []
        thread_list = [i for i in range(0, self.number_of_threads)]
        for thread_id in thread_list:
            thread = PreProcessor(thread_id, self.input_buffer, self.output_buffer, config=self.config)
            thread.start()
            threads.append(thread)
        for t in threads:
            t.join()
        # with open(self.train_dir + "/feature_vectors.csv", 'a') as f:
        #         np.savetxt(f, self.output_buffer, delimiter=',', fmt='%.18e')

        clip_label = get_label(1, self.number_of_class)
        clip_filename = draw_sample_plot_and_save(self.output_buffer.flatten(), "/channel", self.thread_id, self.config)
        sample = create_sample_from_image(clip_filename, clip_label, self.config)
        # sample = create_sample_from_data(self.output_buffer.flatten(), class_label)
        self.writer.write(sample.SerializeToString())
        self.send_noise_data(json.dumps(self.input_buffer.tolist()))
        self.send_preprocessed_data(json.dumps(self.output_buffer.tolist()))
Example #11
0
def data_prep(size=5000, train_file_path='data/train.csv', split=True, remove=None):
    """
    Data preprocessing helper function for local running of the ensemble.
    INPUTS:
    size (int) - number of rows of the train data to use
    train_file_path (string) - filepath to location of train data (as csv)
    split (bool) - whether to split the data into train and test components or
                    leave as one unit.
    remove (list) - optional list of columns to drop during preprocessing;
                    defaults to a preset list of metadata columns.
    """
    # prepare data for modeling
    print("Loading data...")
    train = pd.read_csv(train_file_path)
    if size > len(train):
        df = train
    else:
        df = train[:size]

    print("Preprocessing...")
    P = PreProcessor()

    if remove:
        remove_most = remove
    else:
        remove_most = ['Unnamed: 0', 'annotations', 'archived', 'author', 'date', \
                   'distinguished', 'edited','gilded', 'in_reply_to',
                   'is_first_post', 'link_id', 'link_id_ann', 'majority_link', \
                   'name', 'parent_id', 'replies', 'retrieved_on', 'saved', \
                   'score_hidden', 'subreddit', 'title', 'user_reports', \
                   'ann_1', 'ann_2', 'ann_3']

    if split:
        # make splits
        print("Splitting...")
        df_train, df_test = train_test_split(df, test_size=0.25)
        df_train = P.run(df_train, 'body', cols_to_drop=remove_most, direct_to_model=False)
        df_test = P.run(df_test, 'body', cols_to_drop=remove_most, direct_to_model=False)
        return df_train, df_test
    else:
        df_train = P.run(df, 'body', cols_to_drop=remove_most, direct_to_model=False)
        return df_train
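# A minimal usage sketch for data_prep (not part of the original snippet); it assumes
# the script is run from the project root where data/train.csv lives, and the row
# count and custom drop list below are placeholders.
df_train, df_test = data_prep(size=2000, train_file_path='data/train.csv', split=True)

# Unsplit variant with a caller-supplied drop list; returns a single preprocessed frame.
df_all = data_prep(size=2000, split=False, remove=['Unnamed: 0', 'author', 'subreddit'])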
Example #12
0
def test_digits():
	train_prep = PreProcessor("./data/digitdata/trainingimages", "./data/digitdata/traininglabels", face_data = False)
	valid_prep = PreProcessor("./data/digitdata/validationimages", "./data/digitdata/validationlabels", face_data = False)
	nb = NaiveBayes(train_prep.X_domain, train_prep.y_domain)

	"""
	# TUNING PROCEDURE #
	step_size = 0.005
	start_point = 0.005
	end_point = 0.035
	optimal_k, valid_accuracy = nb.get_optimal_hparam(train_prep.X, train_prep.y, valid_prep.X, valid_prep.y, start_point, end_point, step_size, print_progress = True)
	print("Optimal k: {}\nAccuracy: {}".format(optimal_k, valid_accuracy)) # 0.02
	"""

	nb.train(train_prep.X, train_prep.y, k = 0.02)
	train_pred_list = nb.predict(train_prep.X)
	valid_pred_list = nb.predict(valid_prep.X)

	print("Training Accuracy: {}".format(nb.compute_accuracy(nb.y, train_pred_list)))
	print("Validation Accuracy: {}".format(nb.compute_accuracy(valid_prep.y, valid_pred_list)))

	"""
Example #13
0
def test_faces():
	train_prep = PreProcessor("./data/facedata/facedatatrain", "./data/facedata/facedatatrainlabels")
	valid_prep = PreProcessor("./data/facedata/facedatavalidation", "./data/facedata/facedatavalidationlabels")
	nb = NaiveBayes(train_prep.X_domain, train_prep.y_domain)

	"""
	# TUNING PROCEDURE #
	step_size = 0.5
	start_point = 0.5
	end_point = 1.5 # 10
	optimal_k, valid_accuracy = nb.get_optimal_hparam(train_prep.X, train_prep.y, valid_prep.X, valid_prep.y, start_point, end_point, step_size)
	print("Optimal k: {}\nAccuracy: {}".format(optimal_k, valid_accuracy)) # 2.5
	"""
	

	nb.train(train_prep.X, train_prep.y, k = 2.5)
	train_pred_list = nb.predict(train_prep.X)
	valid_pred_list = nb.predict(valid_prep.X)

	print("Training Accuracy: {}".format(nb.compute_accuracy(nb.y, train_pred_list)))
	print("Validation Accuracy: {}".format(nb.compute_accuracy(valid_prep.y, valid_pred_list)))

	
	"""
Example #14
0
    def do_inference(cls, request_body):
        """与えられたdictを取り出して,
        apps内の各種MLパイプランに乗せて,最終推論結果を返す
        
        Parameters
        ----------
        request_body : dict
            リクエストbody
        
        Returns
        -------
        dict
            レスポンスbody
        """
        label = 'Survived'

        from utils import Utils
        logger = Utils.init_logger('predicton_sample')

        logger.info('Preprocessing and feature engineering')
        from preprocessing import PreProcessor
        pred_df = pd.DataFrame(request_body)
        pp = PreProcessor(config_path=cls.config_path,
                          mode='pred',
                          label=label)
        pred_dataset = pp.get_dataset(pred_df)

        logger.info('Inference')
        cls.load_models()
        from model import Model
        m = Model(config_path=cls.config_path, mode='pred')
        result = m.predict(pred_dataset)

        response_body = {label: result['y']}

        return response_body
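# A hypothetical round trip through do_inference (not part of the original snippet).
# The handler class name (PredictionHandler) and the Titanic-style feature columns are
# assumptions; only the 'Survived' label comes from the code above. request_body maps
# column names to equal-length lists, which pd.DataFrame accepts directly.
request_body = {
    'Pclass': [3, 1],
    'Sex': ['male', 'female'],
    'Age': [34.5, 47.0],
    'Fare': [7.83, 52.0],
}
response_body = PredictionHandler.do_inference(request_body)
print(response_body)  # e.g. {'Survived': [0, 1]}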
Example #15
0
	def __init__(self,state):
		self.state = state
		self.dataset_dir = self.state.get('dataset_dir','')
		self.list_dir = os.path.join(self.dataset_dir,'lists')
		self.lists = {}
		self.lists['train'] = os.path.join(self.list_dir,'train_1_of_1.txt')
		self.lists['valid'] = os.path.join(self.list_dir,'valid_1_of_1.txt')
		self.lists['test'] = os.path.join(self.list_dir,'test_1_of_1.txt')
		self.preprocessor = PreProcessor(self.dataset_dir) 
		print('++++++++++++1.Preparing train/valid/test splits')
		self.preprocessor.prepare_fold(self.lists['train'],self.lists['valid'],self.lists['test'])
		self.data = self.preprocessor.data
		self.targets = self.preprocessor.targets
		print('++++++++++++2.Building model.')
		print("++++++++++++3.please note the params n_inputs, n_outputs and n_hidden")
		self.model = MLP(n_inputs=self.state.get('n_inputs',513),n_outputs=self.state.get('n_outputs',3),
						 n_hidden=self.state.get('n_hidden',[3]),activation=self.state.get('activation','sigmoid'),
						 output_layer=self.state.get('output_layer','sigmoid'),dropout_rates=self.state.get('dropout_rates',None))
Example #16
0
def main():
	train_prep_face = PreProcessor("./data/facedata/facedatatrain", "./data/facedata/facedatatrainlabels")
	valid_prep_face = PreProcessor("./data/facedata/facedatavalidation", "./data/facedata/facedatavalidationlabels")
	test_prep_face = PreProcessor("./data/facedata/facedatatest", "./data/facedata/facedatatestlabels")
	train_prep_digit = PreProcessor("./data/digitdata/trainingimages", "./data/digitdata/traininglabels", face_data = False)
	valid_prep_digit = PreProcessor("./data/digitdata/validationimages", "./data/digitdata/validationlabels", face_data = False)
	test_prep_digit = PreProcessor("./data/digitdata/testimages", "./data/digitdata/testlabels", face_data = False)
	print("Naive Bayes")
	print("=" * 80)
	print("Face Recognition")
	print("-" * 80)
	face_nb = NaiveBayes(train_prep_face.X_domain, train_prep_face.y_domain)
	print_results(face_nb, train_prep_face.X, train_prep_face.y, test_prep_face.X, test_prep_face.y, k = 2.5)
	print("Digit Recognition")
	print("-" * 80)
	digit_nb = NaiveBayes(train_prep_digit.X_domain, train_prep_digit.y_domain)
	print_results(digit_nb, train_prep_digit.X, train_prep_digit.y, test_prep_digit.X, test_prep_digit.y, k = 0.02)

	"""
Example #17
0
    cm.create_config(config_path_wo_datetime)
    config_path = cm.dst_path
    s3_dst_info = {
        's3_config': {
            'aws_profile': args.profile,
            'bucket_name': args.output_s3bucket,
            'path_s3_dst': child_dir
        }
    }
    cm.add_info(config_path, s3_dst_info)

    logger.info('Loading training data')
    train_df = load_train_data(args.input_path)

    logger.info('Preprocessing and feature engineering')
    pp = PreProcessor(config_path=config_path, mode='train', label='Survived')
    train_dataset = pp.get_dataset(train_df)
    pp.save_transformers(child_dir=child_dir,
                         transformers_name='sample_transformers.pkl.cmp')

    logger.info('Training')
    m = Model(config_path=config_path, mode='train')
    m.init_model()
    m.train_with_cv(train_dataset)
    m.save_model(dst_dir=model_dir,
                 child_dir=child_dir,
                 model_name='sample_model.pkl.cmp')

    logger.info('Uploading the files needed at inference time to S3')
    # To also include the config and the training data in the S3 upload, do the following:
    # 1. Update the config and training data info
Example #18
0
    },
    "XGBoost": {
        "model": XGBRegressor(),
        "params": {
            "gamma": np.random.uniform(low=0.01, high=0.05, size=10),
            "max_depth": [4, 5, 6],
            "min_child_weight": [4, 5, 6],
            "reg_alpha": [1e-5, 1e-2, 0.1, 1, 10, 100]
        }
    }
}

if __name__ == "__main__":
    df = load_sol_challenge()
    # Data Preprocessing
    preprocessor = PreProcessor()
    df = preprocessor.str_to_float(df, cols_with_str)
    df = preprocessor.remove_nans(df)

    # EDA
    # Data Distribution
    data_distribution = DataDistribution(cols_to_analyse,
                                         PATH_RESULTS_EDA_DIST,
                                         ignore_outliers=False)
    data_distribution.run(df)
    # Feature Correlation
    feature_correlation = FeatureCorrelation(cols_to_analyse,
                                             PATH_RESULTS_EDA_CORR,
                                             figsize=(9, 9))
    feature_correlation.run(df)
Example #19
0
def main(size=5000, grid=False):
    """
    Composite function designed for running tests.
    INPUTS:
    size (int) - number of rows of the data set to use
    grid (bool) - whether or not to grid search
    OUTPUTS:
    None
    """
    # prepare data for modeling
    print("Loading data...")
    train = pd.read_csv('data/train.csv')
    if size > len(train):
        df = train
    else:
        df = train[:size]

    #make splits
    print("Splitting...")
    df_train, df_test = train_test_split(df, test_size=0.20)

    print("Preprocessing...")
    P = PreProcessor()
    remove_all_but_text = None

    remove_most = ['Unnamed: 0', 'annotations', 'archived', 'author', 'date', \
                   'distinguished', 'edited','gilded', 'in_reply_to',
                   'is_first_post', 'link_id', 'link_id_ann', 'majority_link', \
                   'name', 'parent_id', 'replies', 'retrieved_on', 'saved', \
                   'score_hidden', 'subreddit', 'title', 'user_reports', \
                   'ann_1', 'ann_2', 'ann_3']

    X_train, y_train = P.run(df_train,
                             'body',
                             cols_to_drop=remove_most,
                             direct_to_model=True)
    X_test, y_test = P.run(df_test,
                           'body',
                           cols_to_drop=remove_most,
                           direct_to_model=True)

    # establish baseline models
    baseline_scores = run_baseline_modeling(X_train, y_train, X_test, y_test)
    # look at basic NB model results (reduced to NB)
    nb_models, NB_base_scores = run_basic_nb_models(X_train, y_train, X_test,
                                                    y_test)

    if grid:
        #run grid search
        run_alt_model_tests(X_train, y_train, X_test, y_test)
    else:
        # look at basic model scores
        alt_models, alt_scores = run_alt_models(X_train, y_train, X_test,
                                                y_test)
        print("\n\nBaseline Scores: ")
        for n, s in zip(['Weighted Guess', 'Guess Most Frequent'],
                        baseline_scores):
            print("{}: {}".format(n, s))
        print("Naive Bayes Scores")
        for n, s in zip(['Naive Bayes', 'Multinomial Bayes'], NB_base_scores):
            print("{}: {}".format(n, s))
        print("Other model Scores: ")
        for n, s in zip([
                'Logistic Regression', 'Random Forest', 'Gradient Boosting',
                'Adaboost'
        ], alt_scores):
            print("{}: {}".format(n, s))
Example #20
0
import cv2
import numpy as np

from preprocessing import PreProcessor
# from classifier import FaceClassifier
# from embedded_data import Embedding

# ---
from cnn_classifier import FaceClassifier

from datetime import datetime

cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

p = PreProcessor()
# embedder = Embedding()
# classifier = FaceClassifier()
# data = np.load('data/dataset.npz')
# label = np.load('data/label.npz')
#
# embeded = np.load('data/embedded_images.npz')
# label = label['arr_0']
# embeded = embeded['arr_0']
# --------------------------------------------
# classifier = FaceClassifier()
#
# classifier.set_model_path(ROOT_DIR + '/models/model.pkl')
# classifier.set_label_path(ROOT_DIR + '/models/labels.json')
# classifier.set_data_train_path(ROOT_DIR + '/data/embedded_faces.npz')
# classifier.train()
Example #21
0
        X = X[self.raw_features].copy()
        X_pp = self.preprocessing_pipeline.transform(X)
        X_pp = pd.DataFrame(X_pp, columns=self.get_feature_names())

        return X_pp

    def get_feature_names(self):
        """
        Feature names after preprocessing.
        Replicates the get_feature_names function in the sklearn Transformer classes.
        """
        return self.raw_features


if __name__ == "__main__":
    from preprocessing import PreProcessor  # noqa

    # Load data
    db_config = db.get_config()
    train = db.load(*db_config, 'raw_train')
    X_train = train.drop('SalePrice', axis=1)

    # Fit and transform training data
    pp = PreProcessor()
    X_train_pp = pp.fit_transform(X_train)
    train_pp = X_train_pp.assign(SalePrice=train['SalePrice'])

    # Save preprocessed data and fitted preprocessor
    db.save(train_pp, *db_config, 'processed_train')
    joblib.dump(pp, os.path.join(DIR, '../pickle/PreProcessor.pkl'))
Example #22
0
    },
    "XGBoost": {
        "model": XGBRegressor(),
        "params": {
            "gamma": np.random.uniform(low=0.01, high=0.05, size=10),
            "max_depth": [4, 5, 6],
            "min_child_weight": [4, 5, 6],
            "reg_alpha": [1e-5, 1e-2, 0.1, 1, 10, 100]
        }
    }
}

if __name__ == "__main__":
    df = load_wang_data()
    # Data Preprocessing
    preprocessor = PreProcessor()
    df = preprocessor.remove_nans(df)

    # EDA
    # Data Distribution
    data_distribution = DataDistribution(cols_to_analyse,
                                         PATH_RESULTS_EDA_DIST_WANG_DATA,
                                         ignore_outliers=False)
    data_distribution.run(df)
    # Feature Correlation
    feature_correlation = FeatureCorrelation(cols_to_analyse,
                                             PATH_RESULTS_EDA_CORR_WANG_DATA,
                                             figsize=(9, 9))
    feature_correlation.run(df)
    # Decrease dataset size - just for testing purposes
    df = df[:2000]
Example #23
0
File: main.py  Project: rubiruchi/SensorWeb
logging.basicConfig(level=logging.DEBUG)  #, filename="error.log")

# Make shell script option parser
parser = argparse.ArgumentParser()
parser.add_argument("--train",
                    help="Run model training loops and save models \
                     in models/ directory.",
                    action="store_true")
parser.add_argument("--test",
                    help="Run testing for all models saved in \
                     models/ directory.",
                    action="store_true")
args = parser.parse_args()

# Get datasets
pp = PreProcessor(data_dir="../data/")
train, val, test = pp.get_data()

if args.train:

    # Prepare models for training.
    timesteps = [2, 5, 10, 20, 50, 100, 200]
    models = [LinearModel, Layer1NN, Layer2NN, Layer3NN]
    _trainer = Trainer(timesteps, models, train, val)
    _trainer.train_loop()

elif args.test:

    del train
    del val
Example #24
0
    def test_mode引数を正しく指定できればinstance生成成功(self, do, mode):
        from preprocessing import PreProcessor

        pp = PreProcessor(do.valid_config_path, mode=mode)
        assert pp
Example #25
0
 def ppmock(self, do):
     from preprocessing import PreProcessor
     pp = PreProcessor(do.valid_config_path, mode='train')
     pp.config = do.dummy_config
     return pp