def main():
    ################################
    #
    # DATA PREPARATION
    #
    ################################

    # Data cleaning
    dp = DataPreparation()

    # X, y data for modeling
    X, y = dp.clean()
    variable_names = dp.get_original_variable_names()

    print
    print
    print ' >>>>DATA PREPARATION<<<<'
    print ' Data preparation | Features: {:s}'.format(variable_names)

    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=99)

    print ' Data preparation |'
    print ' Data preparation | Original sample size: {:d}'.format(X.shape[0])
    print ' Data preparation | Training sample size: {:d}'.format(X_train.shape[0])
    print ' Data preparation | Test sample size: {:d}'.format(X_test.shape[0])
    print ' Data preparation | '
    print ' Data preparation | Original death incidence: {:2.3f}'.format(np.mean(y))
    print ' Data preparation | Train set death incidence: {:2.3f}'.format(np.mean(y_train))
    print ' Data preparation | Test set death incidence: {:2.3f}'.format(np.mean(y_test))

    ################################
    #
    # MODEL BUILDING
    #
    ################################

    # Grid search CV best model
    print
    print ' >>>>MODEL SELECTION<<<<'
    ms = ModelSelector(num_folds=5)
    scores = ms.grid_search_cv(X_train, y_train)
    timestmp = datetime.now().strftime('%Y%m%d_%H%M')
    scores.to_csv('./data/scores_' + timestmp + '.csv', index=False)

    # Train best models against entire training set
    # and plot their ROC curves
    ms.plot_roc_curves(X_test, y_test)

    #
    # Calibrate probabilities
    #

    #
    # Score best model against hold-out data set

    return scores
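# The two TODO comments in main() leave probability calibration and hold-out scoring
# unimplemented. A minimal sketch, not from the source: it assumes the selected model is
# exposed by ModelSelector through a hypothetical `best_estimator_` attribute and uses
# scikit-learn's CalibratedClassifierCV for the calibration step.
from sklearn.calibration import CalibratedClassifierCV


def calibrate_and_score(ms, X_train, y_train, X_test):
    # Wrap the (assumed) best estimator with sigmoid (Platt) calibration and refit it
    # on the training set.
    calibrated = CalibratedClassifierCV(ms.best_estimator_, method='sigmoid', cv=5)
    calibrated.fit(X_train, y_train)
    # Calibrated positive-class probabilities on the hold-out set.
    return calibrated.predict_proba(X_test)[:, 1]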
def main(argv):
    if argv[1] == 'train_process':
        get_data()
        data_preparation = DataPreparation()
        data_preparation.generate_data_for_model()
        train_model = Train()
        train_model.compute_locations_models()
        prediction = Prediction()
        prediction.get_models()
        create_dashboard(prediction)
def main():
    data_prep = DataPreparation.default()
    classifier = Classifier.default()

    test_files = glob('training_data/vehicles/*/*.png')
    # Shuffle the file list in place; shuffling a temporary np.array copy would leave it unchanged.
    np.random.shuffle(test_files)

    prepared = data_prep.prepare_images(test_files[0:1000])
    results = classifier.predict(prepared)
    print("Results", results)
    # All sampled files are vehicle images, so predictions of 0 count as errors.
    print("Error", 100 * len(results[results == 0]) / len(results), "%")
def main():
    enable_tracing(False)
    input_file = "test_videos/project_video.mp4"
    output_file = 'output_videos/processed_project_video.mp4'
    processor = VideoProcessor(input_file=input_file,
                               output_file=output_file,
                               classifier=Classifier.rbf(),
                               data_prep=DataPreparation.default())
    print("Processing video", input_file, output_file)
    # processor.process(sub_clip=(12, 15))
    # processor.process(sub_clip=(21, 26), frame_divisor=4)
    # processor.process(sub_clip=(5, 25), frame_divisor=4)
    processor.process(frame_divisor=4)
def train(epoch):
    print("#### TRAINING ####")
    model.train()
    for filename in os.listdir('../random_split/train/'):
        # preprocessing raw data
        df = open_data('../random_split/train/' + filename)
        df = get_clean_data(df, family_accession_valid)
        prepare = DataPreparation(df)
        prepare.encode_sequence()
        prepare.encode_family_accession()

        # creating dataloader for training
        train = data_utils.TensorDataset(
            torch.from_numpy(prepare.torchable_columns()).long(),
            torch.Tensor(df.encoded_family_accession.values).long())
        train_loader = data_utils.DataLoader(train,
                                             batch_size=BATCH_SIZE,
                                             shuffle=True)
        print("Created train loader for file {}".format(filename))

        for batch_idx, (x, target) in enumerate(train_loader):
            x, target = Variable(x).to(device), Variable(target).to(device)
            h0, c0 = (torch.randn(1 * 2, x.shape[0], HIDDEN_SIZE).to(device),
                      torch.randn(1 * 2, x.shape[0], HIDDEN_SIZE).to(device))
            optimizer.zero_grad()
            out = model(x, h0, c0)
            l = loss_fn(out, target)
            l.backward()
            optimizer.step()
            if batch_idx % 100 == 0:
                print('batch {} [{}/{}] training loss: {}'.format(
                    batch_idx, batch_idx * len(x), len(train_loader.dataset),
                    l.item()))

    print("Saving model for {} epoch".format(epoch))
    torch.save(model.state_dict(), 'network.pth')
def _instance(linear, rbf):
    test_data = DataPreparation.default()
    print("Preparing classifier")
    classifier = Classifier()
    classifier._fit(test_data.X_train,
                    test_data.y_train,
                    linear=linear,
                    rbf=rbf)
    return classifier
def test(epoch):
    # Creating metrics
    print("#### EVALUATION #####")
    model = NN2(NUM_EMBEDDINGS, EMBEDDING_DIM, OUT_CHANNELS1, OUT_CHANNELS2,
                HIDDEN_SIZE, LINEAR_HIDDEN, NUM_CLASSES)
    # loading depending if CPU/GPU
    model.load_state_dict(
        torch.load('network.pth', map_location={'cuda:0': 'cpu'}))
    model.to(device)
    model.eval()

    total_correct, total_loss, dataset_length = 0, 0, 0
    concat_prediction, concat_target = torch.empty(0).cpu(), torch.empty(0).cpu()

    for filename in os.listdir('../random_split/dev/'):
        file_loss, file_correct = 0, 0
        # preprocessing raw data
        df = open_data('../random_split/dev/' + filename)
        df = get_clean_data(df, family_accession_valid)
        prepare = DataPreparation(df)
        prepare.encode_sequence()
        prepare.encode_family_accession()

        # creating dataloader for testing
        test = data_utils.TensorDataset(
            torch.from_numpy(prepare.torchable_columns()).long(),
            torch.Tensor(df.encoded_family_accession.values).long())
        test_loader = data_utils.DataLoader(test,
                                            batch_size=BATCH_SIZE,
                                            shuffle=True)
        dataset_length += len(test_loader.dataset)

        for batch_idx, (x, target) in enumerate(test_loader):
            x, target = Variable(x).to(device), Variable(target).to(device)
            h0, c0 = (torch.randn(1 * 2, x.shape[0], HIDDEN_SIZE).to(device),
                      torch.randn(1 * 2, x.shape[0], HIDDEN_SIZE).to(device))
            out = model(x, h0, c0)
            l = loss_fn(out, target)
            file_loss += l
            total_loss += l
            prediction = out.argmax(dim=1, keepdim=True)
            concat_prediction = torch.cat((concat_prediction, prediction.cpu()), 0)
            concat_target = torch.cat((concat_target, target.cpu()), 0)
            file_correct += prediction.eq(target.view_as(prediction)).sum().item()
            total_correct += prediction.eq(target.view_as(prediction)).sum().item()

        taux_classif_file = 100. * file_correct / len(test_loader.dataset)
        print('For file {}, accuracy: {}% -- testing loss {} --- f1-score {}.'.format(
            filename, taux_classif_file, file_loss,
            f1_score(concat_target, concat_prediction, average='weighted')))

    taux_classif_total = 100. * total_correct / dataset_length
    print('Epoch {} : Total testing accuracy: {}% -- testing loss {} --- f1-score {}'.format(
        epoch, taux_classif_total, total_loss,
        f1_score(concat_target, concat_prediction, average='weighted')))
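# The evaluation loop above accumulates loss tensors while autograd is still active, which
# keeps computation graphs alive across batches. A hypothetical variant of the inner batch
# loop (reusing the same globals: model, loss_fn, device, HIDDEN_SIZE) that avoids this:
def evaluate_loader(test_loader):
    total_loss, total_correct = 0.0, 0
    with torch.no_grad():  # no computation graphs are built during evaluation
        for x, target in test_loader:
            x, target = x.to(device), target.to(device)
            # bidirectional single-layer LSTM states: (num_layers * num_directions, batch, hidden)
            h0 = torch.randn(2, x.shape[0], HIDDEN_SIZE, device=device)
            c0 = torch.randn(2, x.shape[0], HIDDEN_SIZE, device=device)
            out = model(x, h0, c0)
            total_loss += loss_fn(out, target).item()  # accumulate Python floats, not tensors
            total_correct += out.argmax(dim=1).eq(target).sum().item()
    return total_loss, total_correct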
def family_accession_encoder():
    # creating family_accession label encoder
    df = open_data('../raw_clean_data.csv')
    prepare = DataPreparation(df)
    prepare.create_label_encoder()
with open(training_files, encoding='utf-8') as f:
    training_audiopaths_and_text = [line.strip().split("|") for line in f]
# if tacotron_params['sort_by_length']:
#     training_audiopaths_and_text.sort(key=lambda x: len(x[1]))

# Read the validation files
with open(validation_files, encoding='utf-8') as f:
    validation_audiopaths_and_text = [line.strip().split("|") for line in f]
# if tacotron_params['sort_by_length']:
#     validation_audiopaths_and_text.sort(key=lambda x: len(x[1]))

# prepare the data
# GST adaptation to put prosody features path as an input argument:
train_data = DataPreparation(training_audiopaths_and_text, tacotron_params)
validation_data = DataPreparation(validation_audiopaths_and_text, tacotron_params)
collate_fn = DataCollate(tacotron_params['number_frames_step'])

# DataLoader prepares a loader for a set of data including a function that processes every
# batch as we wish (collate_fn). This creates an object with which we can list the batches created.
# DataLoader and Dataset (IMPORTANT FOR FURTHER DESIGNS WITH OTHER DATABASES)
# https://jdhao.github.io/2017/10/23/pytorch-load-data-and-make-batch/
train_sampler = DistributedSampler(train_data) if tacotron_params['distributed_run'] else None
val_sampler = DistributedSampler(validation_data) if tacotron_params['distributed_run'] else None

train_loader = DataLoader(train_data,
def main(): """ Checks arguments for validity and starts the training process according to the specified parameters. """ parser = argparse.ArgumentParser( add_help=True, description="This file trains a new neural network on the given dataset.", ) parser.add_argument( "data_dir", help="data directory containing data for training", action="store", type=check_dir_validity, ) parser.add_argument( "--save_dir", action="store", default="./", dest="save_dir", help="directory to save model checkpoints. Expects full path, e.g. /path/to/dir/ without trailing '/'. By default it is stored in the current directory", type=check_dir_validity, ) parser.add_argument( "--arch", action="store", default="vgg13", dest="arch", help="architecture to use as base for model training. Valid values can be found at https://pytorch.org/docs/stable/torchvision/models.html", ) parser.add_argument( "--learning_rate", dest="learning_rate", type=float, default=0.001, action="store", help="learning rate for the optimizer", ) parser.add_argument( "--hidden_units", dest="hidden_units", type=int, default=512, action="store", help="amount of hidden units to use for classifier", ) parser.add_argument( "--epochs", action="store", dest="epochs", default=1, help="amount of training runs", ) parser.add_argument( "--gpu", action="store_true", default=False, dest="gpu", help="enables training on gpu to increase performance", ) args = parser.parse_args() data_preparation = DataPreparation() data_preparation.prepare_training_data(args.data_dir) model_wrapper = ImageModelWrapper() model_wrapper.init_model( args.arch, int(args.hidden_units), float(args.learning_rate) ) train(model_wrapper, data_preparation, int(args.epochs), args.gpu) model_wrapper.save( args.save_dir, int(args.epochs), data_preparation.class_to_idx )
def main(): """ Checks arguments for validity and starts the inference process according to the specified parameters. """ parser = argparse.ArgumentParser( add_help=True, description= "This file performs inference on the passed image and returns the probabilities for the inferred class.", ) parser.add_argument( "image_path", help="path to image on which inference should be done", type=check_file_existence, ) parser.add_argument( "checkpoint_path", help="path to checkpoint containing the model to be used for inference", type=check_file_existence, ) parser.add_argument( "--top_k", dest="top_k", type=int, default=1, action="store", help="amount of classes to return as result of this application", ) parser.add_argument( "--gpu", action="store_true", default=False, dest="gpu", help="enables inference on gpu to increase performance", ) parser.add_argument( "--category_names", dest="category_names_path", type=check_file_existence, default="cat_to_name.json", action="store", help="path to file containing mapping of categories to real names", ) args = parser.parse_args() data_preparation = DataPreparation() image = data_preparation.transform_image(args.image_path) model_wrapper = ImageModelWrapper() model_wrapper.load(args.checkpoint_path) top_p, class_list = predict( image, model_wrapper, args.gpu, args.category_names_path, int(args.top_k), ) for p, name in zip(top_p, class_list): print("Flower is {} with probability {}%".format(name, p * 100))
            both_predictions = both_predictions + model_predictions

        both_predictions = both_predictions / num_of_models
        both_filename = os.path.join(self.HOME_DIR, "output", "both_test.predict")
        np.savetxt(both_filename, both_predictions, fmt='%1.10f', delimiter="\n")


#=======================================================================================
if __name__ == '__main__':
    dp = DataPreparation()
    dp.build_combination(processing_mode=0, out_filename="data_all_clarity.csv")
    dp.clean_data(target_column="clarity")

    feature_man = FeatureManagement()

    phase = 2
    flags = [False, True]

    if flags[0]:
        features = feature_man.get_basic_features(is_clarity=True) + \
                   feature_man.get_text_features(mode=0, type=0) + \
                   feature_man.get_text_features(mode=0, type=1)
        print("Total number of training features {}".format(len(features)))
from data_preparation import DataPreparation
from crf_brand_detection import CrfBrandDetector

if __name__ == "__main__":
    print('Preparing data...')
    prep_df = DataPreparation().features_labels_prep()

    print('Fitting model...')
    model = CrfBrandDetector()
    x_train, x_test, y_train, y_test = model.train_test_split(prep_df)
    model.fit(x_train, y_train)
    model.print_classification_report(x_test, y_test)
    print('Accuracy for whole titles: {}'.format(model.evaluate(x_test, y_test)))

    pred = model.predict(x_test)
    pred.to_csv('helper_files/predictions.csv')
    fig, axes = plt.subplots(2, figsize=(15, 12), sharex=True)
    axes[0].plot(true, label='true', color='b', alpha=0.75)
    axes[0].set_ylabel('energy demand kwh')
    axes[0].legend(loc='best')
    axes[1].plot(evaluation_loss[1], label='evaluation loss', color='r', alpha=0.75)
    axes[1].plot(evaluation_loss[0] * np.ones(len(true)), '--', label='average_loss',
                 color='r', alpha=0.75)
    axes[1].set_ylabel(loss)
    axes[1].legend(loc='best')
    plt.show()
    pass


if __name__ == '__main__':
    print 'getting and preparing input data...'
    window = 48
    freq = '30T'
    dp = DataPreparation('~/git_hub/capstone_data/Azimuth/clean/project_6d8c_featurized.csv',
                         'energy_all', freq)
    df = dp.read_data()
    # resample_dict = defaultdict(list)
    # resample_dict['sum'] = ['energy_all','liq_precip']
    # resample_dict['mean'] = ['T','irr_glo']
    resample_dict = {'sum': ['energy_all', 'liq_precip'],
                     'mean': ['T', 'irr_glo']}
    df = dp.resample_columns(df, resample_dict)
    df = dp.create_time_features(df)
    df_X, df_y = dp.prepare_data_multistep(df, window)
    X_train, X_test, y_train, y_test = dp.train_test_split(df_X.values, df_y.values, 0.1)

    print 'LSTM model training'
    units = np.arange(30, 111, 20)
    sequences = np.array([i * window for i in [1, 3, 5]])
    dropout = [0.0, 0.2]
    activations = ['relu', 'tanh']
from data_preparation import DataPreparation
from lstm_brand_detection import LstmBrandDetector

if __name__ == "__main__":
    prep = DataPreparation(titles_filepath='helper_files/train.csv')
    print('Loading word embeddings...')
    prep.load_glove('helper_files/glove.txt')
    print('Reading data...')
    prep.read_data()
    print('Preparing data...')
    x, y = prep.prepare_data()
    x_train, x_test, y_train, y_test, test_df = prep.train_test_split(x, y)

    model = LstmBrandDetector()
    print('Fitting model...')
    model.create_model()
    model.fit(x_train, y_train, epochs=8)
    print('Accuracy for whole titles: {}'.format(model.evaluate(x_test, y_test)))

    preds = model.predict(x_test, test_df)
    preds.to_csv('helper_files/predictions.csv')
if target_column == "conciseness": output_filename = os.path.join(self.HOME_DIR, "output", "conciseness_valid.predict") else: output_filename = os.path.join(self.HOME_DIR, "output", "clarity_valid.predict") np.savetxt(output_filename, df_test["predictions_proba"], fmt='%1.10f', delimiter="\n") #======================================================================================= if __name__ == '__main__': dp = DataPreparation() dp.build_combination(processing_mode=1, out_filename="data_all_conciseness.csv") dp.clean_data(target_column="conciseness") feature_man = FeatureManagement() phase = 1 flags = [True, False] if flags[0]: features = feature_man.get_basic_features() + \ feature_man.get_text_features(mode=0, type=0) + \ feature_man.get_text_features(mode=0, type=1) print("Total number of training features {}".format(len(features))) print(feature_man.get_basic_features())
if __name__ == '__main__':
    # MODE:
    # 0 = take a percentage of all data, then split it into train (80%) / test (20%)
    # 1 = split data into train / test first (use the same 20% of ALL data as test set for all training sets)
    mode = 1

    # select dataset sizes (up to 1.0) and algorithms (all options: 'logReg', 'svm', 'dt', 'rf', 'ann')
    selectedSizes = [0.2, 0.4, 0.6]
    selectedAlgorithms = ['svm']

    # set number of repetitions and their respective random generator seeds
    randomSeeds = [20]

    # create new directory for results of this run
    # name of the folder can be passed as param (default name is timestamp)
    dirName = createDir()

    # init dicts to hold data
    dataInfo = {}
    fullTrain = {}
    fullTest = {}
    trainScores = {'Samples': []}  # scores for plotting
    testScores = {'Samples': []}   # scores for plotting

    # clean and preprocess data
    dp = DataPreparation('../data/mainSimulationAccessTraces.csv')
    dp.prepareData()

    # run predictions
    main()
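# The MODE comment above distinguishes two sampling strategies. As an illustration only
# (the project's actual split logic lives elsewhere), the two modes could be expressed
# with scikit-learn's train_test_split roughly like this:
from sklearn.model_selection import train_test_split


def split_for_mode(X, y, size, mode, seed):
    if mode == 0:
        # Mode 0: take `size` fraction of ALL data first, then split that subset 80/20.
        if size < 1.0:
            X, _, y, _ = train_test_split(X, y, train_size=size, random_state=seed)
        return train_test_split(X, y, test_size=0.2, random_state=seed)
    # Mode 1: reserve the same 20% of ALL data as the test set (fixed by `seed`),
    # then subsample only the remaining training portion.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=seed)
    if size < 1.0:
        X_tr, _, y_tr, _ = train_test_split(X_tr, y_tr, train_size=size, random_state=seed)
    return X_tr, X_te, y_tr, y_te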
# Read the training files
with open(training_files, encoding='utf-8') as f:
    training_audiopaths_and_text = [line.strip().split("|") for line in f]
# if tacotron_params['sort_by_length']:
#     training_audiopaths_and_text.sort(key=lambda x: len(x[1]))

# Read the validation files
with open(validation_files, encoding='utf-8') as f:
    validation_audiopaths_and_text = [line.strip().split("|") for line in f]
# if tacotron_params['sort_by_length']:
#     validation_audiopaths_and_text.sort(key=lambda x: len(x[1]))

# prepare the data
# GST adaptation to put prosody features path as an input argument:
train_data = DataPreparation(training_audiopaths_and_text, training_prosody_features_path,
                             tacotron_params)
validation_data = DataPreparation(validation_audiopaths_and_text, validation_prosody_features_path,
                                  tacotron_params)
collate_fn = DataCollate(tacotron_params['number_frames_step'])

# DataLoader prepares a loader for a set of data including a function that processes every
# batch as we wish (collate_fn). This creates an object with which we can list the batches created.
# DataLoader and Dataset (IMPORTANT FOR FURTHER DESIGNS WITH OTHER DATABASES)
# https://jdhao.github.io/2017/10/23/pytorch-load-data-and-make-batch/
train_sampler = DistributedSampler(train_data) if tacotron_params['distributed_run'] else None
val_sampler = DistributedSampler(validation_data) if tacotron_params['distributed_run'] else None

train_loader = DataLoader(train_data, num_workers=1, shuffle=False, sampler=train_sampler,
                          batch_size=tacotron_params['batch_size'], pin_memory=False,
                          drop_last=True, collate_fn=collate_fn)
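# DataCollate is defined elsewhere in this project. As a rough, generic sketch of what a
# collate_fn for variable-length (text, mel) pairs typically does (assumed tensor layout;
# this is NOT the project's DataCollate implementation):
import torch
from torch.nn.utils.rnn import pad_sequence


def pad_text_mel_batch(batch):
    # batch: list of (text_ids [T], mel [n_mels, frames]) pairs of varying lengths
    texts, mels = zip(*batch)
    text_lengths = torch.tensor([t.size(0) for t in texts])
    mel_lengths = torch.tensor([m.size(1) for m in mels])
    padded_text = pad_sequence(list(texts), batch_first=True)  # [B, max_T]
    padded_mel = torch.zeros(len(mels), mels[0].size(0), int(mel_lengths.max()))
    for i, m in enumerate(mels):
        padded_mel[i, :, :m.size(1)] = m                        # [B, n_mels, max_frames]
    return padded_text, text_lengths, padded_mel, mel_lengths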
            if test_prediction == 1:
                xbox_left = int(xleft * scale)
                ytop_draw = int(ytop * scale)
                win_draw = int(window * scale)
                boxes.append(((xbox_left + xstart, ytop_draw + ystart),
                              (xbox_left + win_draw + xstart, ytop_draw + win_draw + ystart)))
    return boxes


if __name__ == '__main__':
    # get attributes of our svc object
    classifier = Classifier.rbf()
    data_prep = DataPreparation.default()

    svc = classifier.svc
    X_scaler = data_prep.scaler
    orient = data_prep.hog_config.orient
    pix_per_cell = data_prep.hog_config.pix_per_cell
    cell_per_block = data_prep.hog_config.cell_per_block
    colorspace = data_prep.hog_config.colorspace
    hog_channels = data_prep.hog_config.hog_channels
    spatial_size = data_prep.hog_config.spatial_size
    hist_bins = data_prep.hog_config.hist_bins

    for file in glob('test_images/vlcsnap-2018-07-13-22h34m59s164.png'):
        img = mpimg.imread(file)

        # search_grid = [
        #     (400, 480, 300, 980, .75),
def main():
    classifier = Classifier.default()
    test_data = DataPreparation.default()
    report = classifier.report(test_data.X_test, test_data.y_test)
    print("Classifier score:", report['score'], "params:", report["params"])