def train_and_test_separate_files(train_path, test_path):
    """
    Trains the model on the training data found at train_path and evaluates it
    on the test data found at test_path.

    :param train_path: Path to the training data, located in the "input" folder.
    :type train_path: str
    :param test_path: Path to the test data, also located in the "input" folder.
    :type test_path: str
    :return: None. The accuracy of the model is printed instead.
    :rtype: None
    """
    tr_reviews, tr_labels = Preprocessing().get_data(train_path)
    X_train, y_train = np.array(tr_reviews), np.array(tr_labels)
    model, sf, tr_vecs, imp = generate_model(X_train, y_train)

    test_reviews, test_labels = Preprocessing().get_data(test_path)
    X_test, y_test = np.array(test_reviews), np.array(test_labels)
    y_pred = test_model(model, sf, tr_vecs, imp, X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}%'.format(accuracy * 100))

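# A minimal usage sketch for train_and_test_separate_files above; the CSV file names
# under the "input" folder are placeholders, not taken from the original project.
if __name__ == '__main__':
    train_and_test_separate_files('input/train.csv', 'input/test.csv')
    # prints the test-set accuracy as a percentage
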
def product_scenario(self):
    _config_before = config_train('before', 'PALSAR')
    _log_file = _config_before.LOG_DIR + '/' + _config_before.LOG_FILENAME
    self.__logger = Logger('LandslideWatcher', _config_before.LOG_OUTPUT, _log_file,
                           _config_before.LOG_LEVEL, _config_before.LOG_FORMAT)

    _preprc_before = Preprocessing(_config_before, self.__logger)
    _preprc_before.run()

    _config_after = config_train('after', 'PALSAR')
    _preprc_after = Preprocessing(_config_after, self.__logger)
    _preprc_after.run()

def model_training(self):
    '''
    Train the model
    '''
    pre = Preprocessing()
    print('Loading data')
    df = self.data.read_data(etapa_treino=True)

    print('Training preprocessing')
    # Dataset split and processed
    X, y, features = pre.process(df, etapa_treino=True)

    # Standardize with a scaler (note: the scaled values are not used by the fit below)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(X, y)

    # Create models
    linear_regression_model = linear_model.LinearRegression()
    rf = RandomForestRegressor()

    # Train the random forest on the training data
    model = rf.fit(X, y)

    return model, features

def create_dictionary(in_file=None, passage=None):
    if in_file is not None and os.path.exists(in_file):
        with open(in_file, 'r', encoding='utf-8') as f:
            corpus_lines = f.read().split("\n")
    elif passage is not None:
        corpus_lines = passage.split("\n")
    else:
        print("Invalid input!")
        return

    d = Dictionary()
    p = Preprocessing()
    if d.database_exists(d.DB_DICTIONARY):
        return False

    for line in corpus_lines:
        words = p.fetch_line_words(line)
        for word in words:
            main_word = re.sub(r"[^-A-Za-z0-9]", '', word[0])
            root = p.fetch_lemmatized_word(main_word, word[1])
            d.prepare_word2dic(main_word, root)

    return d.store_prepared_data()

def create_bigram(in_file=None, passage=None):
    if in_file is not None and os.path.exists(in_file):
        with open(in_file, 'r', encoding='utf-8') as f:
            corpus_lines = f.read().split("\n")
    elif passage is not None:
        corpus_lines = passage.split("\n")
    else:
        print("Invalid input!")
        return

    d = Dictionary()
    p = Preprocessing()
    if d.database_exists(d.DB_BIGRAM):
        return False

    for line in corpus_lines:
        words = p.fetch_line_words(line, escape_symbols=False)
        prev_word = (None, None)
        for word in words:
            d.prepare_bigram2dic(word, prev_word)
            prev_word = word

    return d.store_prepared_data()

def main():
    """
    Main function to extract features.
    """
    args = parse_args()
    cfg = load_config(args)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    th.manual_seed(cfg.RNG_SEED)

    failed_log = open(args.csv.split(".csv")[0] + "_failed.txt", "w")

    assert args.target_framerate % args.min_num_features == 0

    preprocess = Preprocessing(
        "3d",
        cfg,
        target_fps=args.target_framerate,
        size=224,
        clip_len=args.clip_len,
        padding_mode='tile',
        min_num_clips=args.min_num_features)

    if args.dataflow:
        readvideo = ReadVideo(
            preprocess,
            framerate=args.target_framerate,
            size=224,
            centercrop=True,
            pix_fmt=args.pix_fmt,
            overwrite=args.overwrite)
        dataset = VideoDataFlow(args.csv)
        # dataset = MultiThreadMapData(
        #     dataset, num_thread=args.num_decoding_thread,
        #     map_func=readvideo,
        #     buffer_size=1000)
        # loader = MultiProcessRunnerZMQ(
        #     dataset, num_proc=1)
        loader = MultiProcessMapDataZMQ(
            dataset,
            num_proc=args.num_decoding_thread,
            map_func=readvideo,
            strict=True)
        loader.reset_state()
        n_dataset = len(dataset)
    else:
        dataset = VideoLoader(
            args.csv,
            preprocess,
            framerate=args.target_framerate,
            size=224,
            centercrop=True,
            pix_fmt=args.pix_fmt,
            overwrite=args.overwrite)
        n_dataset = len(dataset)
        sampler = RandomSequenceSampler(n_dataset, 10)
        loader = DataLoader(
            dataset,
            batch_size=1,
            shuffle=False,
            num_workers=args.num_decoding_thread,
            sampler=sampler if n_dataset > 10 else None)

    model = build_model(cfg)
    perform_test(loader, model, preprocess, cfg, args, failed_log, n_dataset)

def __init__(self, train_features_dir, val_features_dir, test_audios_dir, num_epochs,
             train_batch_size, val_batch_size, learning_rate, base_dir, max_to_keep,
             model_name):  # test_features_dir, test_batch_size
    self.data_loader = DataLoader(train_features_dir, val_features_dir,
                                  train_batch_size, val_batch_size)
    self.test_audios_dir = test_audios_dir
    self.train_batch_size = train_batch_size
    self.val_batch_size = val_batch_size
    self.learning_rate = learning_rate
    self.num_epochs = num_epochs
    self.base_dir = base_dir
    self.model_name = model_name
    self.max_to_keep = max_to_keep

    self.checkpoint_dir = os.path.join(self.base_dir, self.model_name, "checkpoints")
    if not os.path.exists(self.checkpoint_dir):
        os.makedirs(self.checkpoint_dir)

    self.summary_dir = os.path.join(self.base_dir, self.model_name, "summaries")
    if not os.path.exists(self.summary_dir):
        os.makedirs(self.summary_dir)

    prep = Preprocessing()
    self.sample_rate = prep.sample_rate
    self.frame_dim = int(prep.frame_len / 2) + 1
    self.frame_count = prep.frame_count
    self.preprocess_test_data = prep.preprocess_test_data
    self.preprocess_test_data_unet = prep.preprocess_test_data_unet
    self.produce_time_outputs = prep.produce_time_outputs
    self.time_outputs_into_track = prep.time_outputs_into_track
    self.produce_time_outputs_unet = prep.produce_time_outputs_unet
    self.time_outputs_into_track_unet = prep.time_outputs_into_track_unet

def main():
    image_shape = (128, 128, 3)
    datadir = "/home/awolfert/projects/engagement-l2tor/data/emotions/"

    prep = Preprocessing(datadir, "x_train3.txt", "x_test3.txt", "x_val3.txt",
                         "y_train3.txt", "y_test3.txt", "y_val3.txt")
    x_train, y_train, x_val, y_val = prep.getTrainData(trim=True, img_shape=image_shape)

    train(x_train, y_train, x_val, y_val, image_shape)

def fit_model(self, df):
    start = time()
    logger.debug('Fitting model to data')

    # Preprocessing the dataset (cleaning, reformatting)
    pp = Preprocessing()
    self.status = "preprocessing : cleaning, reformatting"
    pp.fit(df)
    train = pp.transform(df)
    logger.debug('DataFrame shape : %s, %s' % (train.shape[0], train.shape[1]))
    logger.debug('Target distribution: %s' % (train['product_rating'].value_counts()))

    # Tuning model's hyperparameters with a Bayesian Optimizer
    self.status = "tuning model's hyperparameters"
    params = self.model_tuning(train)

    # Fitting data to our model with updated parameters
    self.status = "fitting data to the model"
    xgbclf = xgb.XGBClassifier(**params)
    xgbclf.fit(train.drop('target', 1), train.target)
    logger.debug('Time elapsed : %0.3fs' % (time() - start))

    # Saving model
    joblib.dump(xgbclf, folder_name + "xgb.model")
    joblib.dump(pp, folder_name + 'preprocessing.model')
    logger.debug('Model for XGBoost & preprocessing data saved.')

def run_experiment(self):
    '''
    Run the specified experiments

    :return: Dict with metrics
    '''
    pre = Preprocessing()

    print('Reading Data')
    train_df = DataSource().read_data(etapa_treino=True)
    test_df, y_test = DataSource().read_data(etapa_treino=False)
    y_test = y_test['SalePrice']

    print('Preprocessing Data')
    X_train, y_train = pre.process(train_df, etapa_treino=True)

    print('Processing Test Data')
    X_test = pre.process(test_df[pre.train_features], etapa_treino=False)

    print('Training Model')
    models = Experiments().train_model(X_train, y_train)

    print('Running Metrics')
    for model in models.keys():
        print(model)
        y_pred = models[model].predict(X_test)
        print(Metrics().calculate_regression(y_test, pd.Series(y_pred)))
        metrics = Metrics().calculate_regression(y_test, pd.Series(y_pred))
        pd.DataFrame.from_dict(
            metrics, orient='index').to_csv('../output/' + model + '.csv')

    return metrics

def run_parallel(olo, cv, cmplt):
    random.seed(SEED_NUMBER)
    makedirs(REPORT_PATH, exist_ok=True)
    makedirs(join(REPORT_PATH, DATA_HEADER), exist_ok=True)

    data = Preprocessing()
    data.main(olo, cv, cmplt)

    if data.data_train_folds:
        n_jobs = len(data.data_train_folds)
    else:
        n_jobs = AVG_COUNT

    start = time.time()
    arg_instances = [[idx, data] for idx in range(n_jobs)]
    results = Parallel(n_jobs=n_jobs, verbose=1, backend="multiprocessing")(
        map(delayed(handle_model), arg_instances))
    end = time.time()
    print('multi-threading time = {:.3f}'.format((end - start) / 60))

    ml_performance = [result[0] for result in results]
    class_precision = [result[1] for result in results]
    track_to_plot = [result[2] for result in results]

    avg_perf = avg_performance(ml_performance)
    print('Average ML performance:')
    for metric, val in avg_perf.items():
        print(metric + ' ' + str('%.5f' % val))

    avg_precision = avg_performance(class_precision)
    plot_bar(avg_precision, 'precision')
    plot_records(track_to_plot)

def classifyWithModel(self, model, sentence, preprocess=None):
    if model is not None:
        if preprocess is None:
            preprocess = Preprocessing()
        sentence = preprocess.process(sentence)
        sentence_split = sentence.split(" ")

        clas = {}
        for c in model['clas']:
            vj = model['prior'][c]
            for cc in sentence_split:
                if cc in model['cond_prob'][c]:
                    vj *= model['cond_prob'][c][cc]
            clas[c] = vj

        prev = 0
        curr = 0
        argmax = ''
        for c in model['clas']:
            curr = clas[c]
            if curr > prev:
                argmax = c
                prev = curr

        print("Test data : ", sentence)
        print('Class : ', argmax)
        return argmax
    else:
        print("No model!")
        return False

def preprocessingText(self, doPreprocessing, progress, qc):
    if self.con is not None:
        if self.training_table:
            self.dataTraining = self.con.getDataAsDF(self.training_table)
            progress.setValue(1)
            if self.dataTraining is not None:
                p = Preprocessing(con=self.con)
                progressP = 1
                progressS = (99 - progressP) / len(self.dataTraining.index)
                for index, row in self.dataTraining.iterrows():
                    text = row[self.text_col]
                    if doPreprocessing:
                        pretext = p.process(text)
                        pretext = pretext['stemmed_text']
                    else:
                        pretext = p.processNoPre(text)
                    self.dataTraining.at[index, self.text_col] = pretext
                    progressP += progressS
                    progress.setValue(progressP)
                    qc.processEvents()
                qc.processEvents()
                progress.setValue(99)
        else:
            print("No training table!")
    progress.setValue(100)

def feature_size_acc(dataset, model, **kwargs):
    fractions = [0.2, 0.4, 0.6, 0.8, 1]
    accuracies = []

    for frac in fractions:
        if dataset == Dataset.imdb_reviews:
            dataset_features = imdb_feature_n
        elif dataset == Dataset.twenty_news:
            dataset_features = twenty_features_n
        num_features = int(frac * dataset_features)

        preprocessor = Preprocessing(dataset, max_features=num_features)
        set_ = preprocessor.get_train_test()
        acc = evaluate_model(model, dataset, set_, verbose=False, show_plot=False, **kwargs)
        accuracies.append(acc)

    df = pd.DataFrame(columns=["Fraction of features", "Accuracy"])
    df["Fraction of features"] = fractions
    df["Accuracy"] = accuracies
    df.plot.line(x="Fraction of features", y="Accuracy")
    plt.title("training set feature size vs accuracy {}-{}".format(dataset, model))
    plt.show()

def classify(self, sentence, preprocess=None):
    if self.model is not None:
        if preprocess is None:
            preprocess = Preprocessing()
        sentence = preprocess.process(sentence)
        sentence_split = sentence['stemmed_text'].split(" ")

        clas = {}
        for c in self.model['clas']:
            vj = self.model['prior'][c]
            for cc in sentence_split:
                if cc in self.model['cond_prob'][c]:
                    vj *= self.model['cond_prob'][c][cc]
            clas[c] = vj

        prev = 0
        curr = 0
        argmax = ''
        for c in self.model['clas']:
            curr = clas[c]
            if curr > prev:
                argmax = c
                prev = curr

        print("Test data : ", sentence)
        print('Class : ', argmax)
        return argmax
    else:
        print("No model!")
        return False

def do(config):
    # Read data & preprocess
    print("Read data")
    ds = Datasets(config.data_path)
    data = ds.read_data()

    print("Data preprocessing..")
    preprocessing = Preprocessing(config)
    X = preprocessing.do(data)

    print('Train model')
    if config.sg == 'CBOW':
        model = Word2Vec(
            sentences=X,
            size=config.size,
            window=config.window,
            min_count=config.min_count,
            workers=config.workers,
            sg=0
        )
    else:
        model = Word2Vec(
            sentences=X,
            size=config.size,
            window=config.window,
            min_count=config.min_count,
            workers=config.workers,
            sg=1
        )

    print(model.wv.vectors.shape)
    model.save(os.path.join(config.save_directory, config.ckpt_name))

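# A hedged usage sketch: loading the Word2Vec checkpoint saved by do() above and querying
# it with gensim. The checkpoint path and the query token are placeholders, not values
# from the original config.
from gensim.models import Word2Vec

w2v = Word2Vec.load('checkpoints/word2vec.model')  # i.e. os.path.join(save_directory, ckpt_name)
print(w2v.wv.most_similar('example', topn=5))      # nearest neighbours of a query token
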
def __init__(self, args):
    '''
    Run a DDQN training session, or test its result, with the donkey simulator
    '''
    self.args = args
    self = init_simulator(self)

    # Construct gym environment. Starts the simulator if path is given.
    self.env = gym.make(self.args.env_name, conf=self.conf)
    self.memory = deque(maxlen=10000)

    # Get size of state and action from environment
    self.state_size = (img_rows, img_cols, img_channels)
    self.action_space = self.env.action_space  # Steering and Throttle

    self.agent = DQNAgent(self.state_size,
                          self.action_space,
                          input_shape=(img_rows, img_cols, img_channels),
                          output_size=turn_bins,
                          train=not args.test)
    self.preprocessing = Preprocessing()

    if os.path.exists(args.model):
        print("load the saved model")
        self.agent.load_model(args.model)

    try:
        self.run_ddqn()
    except KeyboardInterrupt:
        print("stopping run...")
    finally:
        self.env.unwrapped.close()

def main():
    print(Welcome.WELCOME)

    # Init visualization
    v = Visualization()

    # Ask which data to load
    data_file = input(Data.Q_DATA_2_LOAD)

    # Save the data file to the preprocessing class
    pp = Preprocessing(data_file)

    # Ask if the user wants to see the raw data
    v.show_raw_information(pp.raw)

    # Ask if the user wants to decrease the time channels of the raw data
    pp.decrease_time_channels()

    # Ask and apply a notch filter if required
    if pp.notch_filter() == Notch_filter.APPLY_NOTCH_FILTER:
        # If the user has applied the filter, we ask if they want to see the results
        v.plot_data(pp.raw)

    # Ask and apply a bandpass filter if required
    bandpass_filter(pp)

    # Ask and apply an ICA filter if required
    if pp.ica_filter(v) == ICA_filter.APPLY_ICA_FILTER:
        print("TODO: I think this should be removed")

def make_submission():
    INV_CLASS = {
        0: 'Black-grass',
        1: 'Charlock',
        2: 'Cleavers',
        3: 'Common Chickweed',
        4: 'Common wheat',
        5: 'Fat Hen',
        6: 'Loose Silky-bent',
        7: 'Maize',
        8: 'Scentless Mayweed',
        9: 'Shepherds Purse',
        10: 'Small-flowered Cranesbill',
        11: 'Sugar beet'
    }

    preprocessing = Preprocessing()
    model = CNN_NET(4, 12, 0).to('cuda')

    predict_data = preprocessing.test_data_read()
    predict_input = torch.from_numpy(predict_data['image'])
    predict_dataset = torch.utils.data.TensorDataset(predict_input)
    predict_set = torch.utils.data.DataLoader(predict_dataset, batch_size=32)

    model.load_state_dict(torch.load('./model.pth'))
    # run inference with the project's `predict` helper (assumed to be imported elsewhere)
    prediction = predict(model, 'cuda', predict_set)
    predict_data['label'] = prediction

    with open('submission.csv', 'w', encoding='utf-8') as f:
        f.write('file,species' + '\n')
        for i in range(len(predict_data['id'])):
            f.write(predict_data['id'][i] + ',' + INV_CLASS[prediction[i]] + '\n')

def fine_tune(epochs=20, dropout=0.25, lr=1e-5):
    model = CNN_NET(4, 12, dropout).to('cuda')
    preprocessing = Preprocessing()

    data = preprocessing.read_image()
    data['image'] = torch.from_numpy(data['image'])
    data['label'] = torch.from_numpy(data['label'])
    train_set = torch.utils.data.TensorDataset(data['image'], data['label'])

    # Shuffle indices and split into a held-out test set and a training set
    indices = np.random.permutation(len(train_set))
    test_sequence = torch.from_numpy(indices)
    test_set = torch.utils.data.Subset(train_set, test_sequence[0:400])
    train_set_split = torch.utils.data.Subset(train_set, test_sequence[400:])

    train_loader = torch.utils.data.DataLoader(dataset=train_set_split, batch_size=64,
                                               shuffle=True, num_workers=4)
    test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32)

    print('Total length: ', len(train_set))
    print('train_length:', len(train_set_split))
    print('test_length:', len(test_set))

    try:
        model.load_state_dict(torch.load('./model.pth'))
    except FileNotFoundError:
        print('Initialization')

    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        train(model, 'cuda', train_loader, optimizer, epoch=epoch)
        test(model, 'cuda', test_loader)

    torch.save(model.state_dict(), './model.pth')

def get_data():
    '''
    Returns np array of labelled data
    '''
    labels = []
    x = []
    test = []
    paths = []
    pp = Preprocessing()

    # get list of data files
    flare_list = pp.get_training_data(pp.flare_path)
    blurry_list = pp.get_training_data(pp.blurry_path)
    good_list = pp.get_training_data(pp.good_path)

    paths = [pp.flare_path] * len(flare_list)
    paths = paths + [pp.blurry_path] * len(blurry_list)
    paths = paths + [pp.good_path] * len(good_list)
    test = flare_list + blurry_list + good_list

    # label flare files as 1 and add to training set
    append_sets(flare_list, pp.flare_path, x, labels, 1)
    # label blurry files as 2 and add to training set
    append_sets(blurry_list, pp.blurry_path, x, labels, 2)
    # label good files as 0 and add to training set
    append_sets(good_list, pp.good_path, x, labels, 0)

    return np.float32(x), np.array(labels, dtype=np.int32), test, paths

def model_training(self):
    '''
    Train the model.

    :return: Dict with trained model, preprocessing used and columns used in training
    '''
    # When the model is trained, the "Preprocessing" class is called
    pre = Preprocessing()

    print('Loading data')
    # Read the data for the training stage
    df = self.data.read_data(etapa_treino=True)

    print('Training preprocessing')
    # Preprocess the training data ("X_train" and "y_train");
    # the training data comes together with the label
    X_train, y_train = pre.process(df, etapa_treino=True)

    print('Training Model')
    # Instantiate a linear regression
    model_obj = LinearRegression()
    # Fit the model to the data (so the model starts learning from the data)
    model_obj.fit(X_train, y_train)

    # Return value of "model_training" (the function returns a dictionary)
    model = {'model_obj': model_obj,        # trained model object
             'preprocessing': pre,          # preprocessing object used
             'colunas': pre.feature_names}  # names of the features the model was trained with
    print(model)

    # Save the model output
    dump(model, '../output/modelo.pkl')

    return model

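# A hedged usage sketch for the artefact saved by model_training above. The joblib-style
# `load` and the new DataFrame `df_new` (with the raw columns expected by the preprocessing
# step) are assumptions, not part of the original project.
from joblib import load

artifacts = load('../output/modelo.pkl')
X_new = artifacts['preprocessing'].process(df_new, etapa_treino=False)
y_pred = artifacts['model_obj'].predict(X_new)
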
def __init__(self, lang_code, method="LSA", n_words=200, k=1, sv_threshold=0.5,
             min_df=0, max_df=.1, use_idf=True):
    self.lang_code = lang_code
    self.method = method
    self.n_words = n_words
    self.k = k  # num topics
    self.sv_threshold = sv_threshold
    self.min_df = min_df
    self.max_df = max_df
    self.use_idf = use_idf

    self.valid_langs = ["en"]
    if self.lang_code in self.valid_langs:
        self.p = Preprocessing(lang_code=lang_code)
        self.tfidf = TfidfVectorizer(min_df=min_df, max_df=max_df, use_idf=use_idf)

def test_extract_bigrams(self):
    expected = ["football_player", "goal_goal", "cup_league", "result_result"]

    p = Preprocessing(lang_code='en')
    bigrams = p.extract_bigrams(self.token_list)

    self.assertTrue(isinstance(bigrams, list))
    self.assertEqual(bigrams, expected)

def main():
    preprocessing = Preprocessing(Config.train_poems_location)
    preprocessing.preprocess()

    model = PoemModel(preprocessed=preprocessing,
                      weight_file=Config.weight_file,
                      window_size=Config.window_size,
                      learning_rate=0.001,
                      batch_size=32)

def test_keep_valid_tokens(self):
    expected = ["result", "goal", "game", "cup", "football_player", "cup_league"]

    p = Preprocessing(lang_code=self.lang_code)
    valid_tokens = p.keep_valid_tokens(self.token_list)

    self.assertNotIn('goal_goal', valid_tokens)
    self.assertNotIn('result_result', valid_tokens)
    self.assertTrue(isinstance(valid_tokens, list))
    self.assertEqual(valid_tokens, expected)

def save_stock(self, stock):
    with open("dataset/" + stock + ".csv", 'w+') as file:
        file.write("date,open,max,min,close,volume,y\n")
        prep = Preprocessing()
        close = np.array(self.data[stock])[:, 4]
        y = prep.create_train_result(close)
        for index in range(len(y)):
            x = ",".join([str(i) for i in self.data[stock][index]])
            file.write(x + "," + str(y[index]) + "\n")

def main(args):
    pp = Preprocessing()

    # load data
    print("Loading Data.....\n\n")
    train_block, train_block_label = pp.read_train_file(args.train_data, args.train_label)
    test_block = pp.read_test_file(args.test_data)

    # explore data, do some visualization
    print("Exploring Data (see 'fig' folder for visualization) .....\n\n")
    viz = Visualization()
    # histogram of the LPC coefficient distribution
    viz.visualize_lpc_distribution(train_block)
    # histogram of the block length (or point of time) distribution
    viz.visualize_block_length_distribution(train_block)
    # plot one block of LPC coefficients per speaker to look at the pattern of voice frequency
    viz.visualize_lpc_time_series(train_block)
    viz.visualize_fitted_lpc_series(train_block)

    max_length = 29
    final_block_size = 18

    print("Data Preprocessing (padding to fixed size blocks)....\n\n")
    # Take the best length (18), truncate longer blocks, and pad shorter blocks with their last row
    train_data = pp.pad_to_fixed_size_blocks(train_block, max_length, final_block_size)
    test_data = pp.pad_to_fixed_size_blocks(test_block, max_length, final_block_size)

    # dummy test labels for convenience
    test_block_label = [[i] for i in np.zeros(len(test_data))]

    print("Generating Features (for ML Algorithms)... \n\n")
    # Generate fixed length feature vectors for traditional machine learning input
    final_train_data = pp.convert_to_vectors(train_data, train_block_label, final_block_size)
    final_test_data = pp.convert_to_vectors(test_data, test_block_label, final_block_size)

    # See scatter plot to find out if there is grouping based on the feature vectors
    viz.lpc_scatter_plot(final_train_data)

    # Looks like there is a grouping, so let's try to classify using some popular algorithms
    model = Models()
    model.run_classification_models(final_train_data, final_test_data)
    print("SVM Prediction Saved (see 'results/submission.txt' )... \n\n")

    # Also try LSTM for classification
    model.run_LSTM_model(np.array(train_data), np.array(train_block_label), np.array(test_data))
    print("LSTM Prediction Saved (see 'results/submission_lstm.txt' )... \n\n")

def preprocessing(self, doPreprocessing, doFeatureSelection, take_feature, threshold, progress, qc):
    features = None
    if self.con is not None:
        if self.training_table:
            # self.dataTraining = self.con.getDataAsDF(self.training_table)
            progress.setValue(10)
            if self.dataTraining is not None:
                p = Preprocessing(con=self.con)
                oritext = None
                uniqFeature = []
                features = {}
                originalFeatureCount = 0
                progressP = 10
                progressS = (70 - progressP) / len(self.dataTraining.index)
                for index, row in self.dataTraining.iterrows():
                    text = row[self.text_col]
                    if doPreprocessing:
                        pretext = p.process(text)
                        oritext = pretext['oritext']
                        pretext = pretext['stemmed_text']
                    else:
                        pretext = p.processNoPre(text)
                    t = p.processNoPre(pretext).split(" ")  # bad performance
                    uniqFeature.extend(t)  # bad performance
                    # print("Ori : ", text)
                    # print("Preprocessed : ", pretext, " -> ", row[self.class_col])
                    self.dataTraining.at[index, self.text_col] = pretext
                    progressP += progressS
                    progress.setValue(progressP)
                    # time.sleep(0.5)
                    qc.processEvents()
                progress.setValue(70)
                qc.processEvents()
                uniqFeature = set(uniqFeature)  # bad performance
                qc.processEvents()
                features['featurebefore'] = len(uniqFeature)  # bad performance
                qc.processEvents()
                progress.setValue(80)
                features['vsm'] = self.builtVSM(doFeatureSelection, take_feature,
                                                threshold, qc=qc)
                features['oritext'] = oritext
                progress.setValue(90)
        else:
            print("No training table!")
    progress.setValue(100)
    return features

def buttonClick(self):
    filename = askopenfilename()
    self.filename = filename
    print(filename)
    # do here
    self.TLabel1.configure(text=filename)
    self.TLabel1['text'] = filename

    splittingData = Preprocessing(filename)
    self.x_train, self.y_train, self.x_test, self.y_test = splittingData.Split()