def main():
    ################################
    #
    # DATA PREPARATION
    #
    ################################

    # Data cleaning
    dp = DataPreparation()

    # X, y data for modeling
    X, y = dp.clean()
    variable_names = dp.get_original_variable_names()

    print
    print
    print ' >>>>DATA PREPARATION<<<<'
    print ' Data preparation | Features: {:s}'.format(variable_names)

    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=99)

    print ' Data preparation |'
    print ' Data preparation | Original sample size: {:d}'.format(X.shape[0])
    print ' Data preparation | Training sample size: {:d}'.format(X_train.shape[0])
    print ' Data preparation | Test sample size: {:d}'.format(X_test.shape[0])
    print ' Data preparation | '
    print ' Data preparation | Original death incidence: {:2.3f}'.format(np.mean(y))
    print ' Data preparation | Train set death incidence: {:2.3f}'.format(np.mean(y_train))
    print ' Data preparation | Test set death incidence: {:2.3f}'.format(np.mean(y_test))

    ################################
    #
    # MODEL BUILDING
    #
    ################################

    # Grid search CV best model
    print
    print ' >>>>MODEL SELECTION<<<<'
    ms = ModelSelector(num_folds=5)
    scores = ms.grid_search_cv(X_train, y_train)
    timestmp = datetime.now().strftime('%Y%m%d_%H%M')
    scores.to_csv('./data/scores_' + timestmp + '.csv', index=False)

    # Train best models against entire training set
    # and plot their ROC curves
    ms.plot_roc_curves(X_test, y_test)

    #
    # Calibrate probabilities
    #

    #
    # Score best model against hold-out data set

    return scores
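# The two TODO comments in main() leave probability calibration and hold-out scoring
# unimplemented. A minimal sketch, not from the source: it assumes the selected model is
# exposed by ModelSelector through a hypothetical `best_estimator_` attribute and uses
# scikit-learn's CalibratedClassifierCV for the calibration step.
from sklearn.calibration import CalibratedClassifierCV


def calibrate_and_score(ms, X_train, y_train, X_test):
    # Wrap the (assumed) best estimator with sigmoid (Platt) calibration and refit it
    # on the training set.
    calibrated = CalibratedClassifierCV(ms.best_estimator_, method='sigmoid', cv=5)
    calibrated.fit(X_train, y_train)
    # Calibrated positive-class probabilities on the hold-out set.
    return calibrated.predict_proba(X_test)[:, 1]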
def main(argv):
    if argv[1] == 'train_process':
        get_data()
        data_preparation = DataPreparation()
        data_preparation.generate_data_for_model()
        train_model = Train()
        train_model.compute_locations_models()
        prediction = Prediction()
        prediction.get_models()
        create_dashboard(prediction)
def main():
    data_prep = DataPreparation.default()
    classifier = Classifier.default()

    test_files = glob('training_data/vehicles/*/*.png')
    # Shuffle the file list in place; shuffling a temporary np.array copy would leave it unchanged.
    np.random.shuffle(test_files)

    prepared = data_prep.prepare_images(test_files[0:1000])
    results = classifier.predict(prepared)
    print("Results", results)
    # All sampled files are vehicle images, so predictions of 0 count as errors.
    print("Error", 100 * len(results[results == 0]) / len(results), "%")
def main():
    enable_tracing(False)
    input_file = "test_videos/project_video.mp4"
    output_file = 'output_videos/processed_project_video.mp4'
    processor = VideoProcessor(input_file=input_file,
                               output_file=output_file,
                               classifier=Classifier.rbf(),
                               data_prep=DataPreparation.default())
    print("Processing video", input_file, output_file)
    # processor.process(sub_clip=(12, 15))
    # processor.process(sub_clip=(21, 26), frame_divisor=4)
    # processor.process(sub_clip=(5, 25), frame_divisor=4)
    processor.process(frame_divisor=4)
def train(epoch):
    print("#### TRAINING ####")
    model.train()
    for filename in os.listdir('../random_split/train/'):
        # preprocessing raw data
        df = open_data('../random_split/train/' + filename)
        df = get_clean_data(df, family_accession_valid)
        prepare = DataPreparation(df)
        prepare.encode_sequence()
        prepare.encode_family_accession()

        # creating dataloader for training
        train = data_utils.TensorDataset(
            torch.from_numpy(prepare.torchable_columns()).long(),
            torch.Tensor(df.encoded_family_accession.values).long())
        train_loader = data_utils.DataLoader(train,
                                             batch_size=BATCH_SIZE,
                                             shuffle=True)
        print("Created train loader for file {}".format(filename))

        for batch_idx, (x, target) in enumerate(train_loader):
            x, target = Variable(x).to(device), Variable(target).to(device)
            h0, c0 = (torch.randn(1 * 2, x.shape[0], HIDDEN_SIZE).to(device),
                      torch.randn(1 * 2, x.shape[0], HIDDEN_SIZE).to(device))
            optimizer.zero_grad()
            out = model(x, h0, c0)
            l = loss_fn(out, target)
            l.backward()
            optimizer.step()
            if batch_idx % 100 == 0:
                print('batch {} [{}/{}] training loss: {}'.format(
                    batch_idx, batch_idx * len(x), len(train_loader.dataset),
                    l.item()))

    print("Saving model for {} epoch".format(epoch))
    torch.save(model.state_dict(), 'network.pth')
def _instance(linear, rbf):
    test_data = DataPreparation.default()
    print("Preparing classifier")
    classifier = Classifier()
    classifier._fit(test_data.X_train,
                    test_data.y_train,
                    linear=linear,
                    rbf=rbf)
    return classifier
def test(epoch):
    # Creating metrics
    print("#### EVALUATION #####")
    model = NN2(NUM_EMBEDDINGS, EMBEDDING_DIM, OUT_CHANNELS1, OUT_CHANNELS2,
                HIDDEN_SIZE, LINEAR_HIDDEN, NUM_CLASSES)
    # loading depending if CPU/GPU
    model.load_state_dict(
        torch.load('network.pth', map_location={'cuda:0': 'cpu'}))
    model.to(device)
    model.eval()

    total_correct, total_loss, dataset_length = 0, 0, 0
    concat_prediction, concat_target = torch.empty(0).cpu(), torch.empty(0).cpu()

    for filename in os.listdir('../random_split/dev/'):
        file_loss, file_correct = 0, 0
        # preprocessing raw data
        df = open_data('../random_split/dev/' + filename)
        df = get_clean_data(df, family_accession_valid)
        prepare = DataPreparation(df)
        prepare.encode_sequence()
        prepare.encode_family_accession()

        # creating dataloader for testing
        test = data_utils.TensorDataset(
            torch.from_numpy(prepare.torchable_columns()).long(),
            torch.Tensor(df.encoded_family_accession.values).long())
        test_loader = data_utils.DataLoader(test,
                                            batch_size=BATCH_SIZE,
                                            shuffle=True)
        dataset_length += len(test_loader.dataset)

        for batch_idx, (x, target) in enumerate(test_loader):
            x, target = Variable(x).to(device), Variable(target).to(device)
            h0, c0 = (torch.randn(1 * 2, x.shape[0], HIDDEN_SIZE).to(device),
                      torch.randn(1 * 2, x.shape[0], HIDDEN_SIZE).to(device))
            out = model(x, h0, c0)
            l = loss_fn(out, target)
            file_loss += l
            total_loss += l
            prediction = out.argmax(dim=1, keepdim=True)
            concat_prediction = torch.cat((concat_prediction, prediction.cpu()), 0)
            concat_target = torch.cat((concat_target, target.cpu()), 0)
            file_correct += prediction.eq(target.view_as(prediction)).sum().item()
            total_correct += prediction.eq(target.view_as(prediction)).sum().item()

        taux_classif_file = 100. * file_correct / len(test_loader.dataset)
        print('For file {}, accuracy: {}% -- testing loss {} --- f1-score {}.'.format(
            filename, taux_classif_file, file_loss,
            f1_score(concat_target, concat_prediction, average='weighted')))

    taux_classif_total = 100. * total_correct / dataset_length
    print('Epoch {} : Total testing accuracy: {}% -- testing loss {} --- f1-score {}'.format(
        epoch, taux_classif_total, total_loss,
        f1_score(concat_target, concat_prediction, average='weighted')))
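# The evaluation loop above accumulates loss tensors while autograd is still active, which
# keeps computation graphs alive across batches. A hypothetical variant of the inner batch
# loop (reusing the same globals: model, loss_fn, device, HIDDEN_SIZE) that avoids this:
def evaluate_loader(test_loader):
    total_loss, total_correct = 0.0, 0
    with torch.no_grad():  # no computation graphs are built during evaluation
        for x, target in test_loader:
            x, target = x.to(device), target.to(device)
            # bidirectional single-layer LSTM states: (num_layers * num_directions, batch, hidden)
            h0 = torch.randn(2, x.shape[0], HIDDEN_SIZE, device=device)
            c0 = torch.randn(2, x.shape[0], HIDDEN_SIZE, device=device)
            out = model(x, h0, c0)
            total_loss += loss_fn(out, target).item()  # accumulate Python floats, not tensors
            total_correct += out.argmax(dim=1).eq(target).sum().item()
    return total_loss, total_correct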
def family_accession_encoder():
    # creating family_accession label encoder
    df = open_data('../raw_clean_data.csv')
    prepare = DataPreparation(df)
    prepare.create_label_encoder()
with open(training_files, encoding='utf-8') as f:
    training_audiopaths_and_text = [line.strip().split("|") for line in f]
# if tacotron_params['sort_by_length']:
#     training_audiopaths_and_text.sort(key=lambda x: len(x[1]))

# Read the validation files
with open(validation_files, encoding='utf-8') as f:
    validation_audiopaths_and_text = [line.strip().split("|") for line in f]
# if tacotron_params['sort_by_length']:
#     validation_audiopaths_and_text.sort(key=lambda x: len(x[1]))

# prepare the data
# GST adaptation to put prosody features path as an input argument:
train_data = DataPreparation(training_audiopaths_and_text, tacotron_params)
validation_data = DataPreparation(validation_audiopaths_and_text, tacotron_params)
collate_fn = DataCollate(tacotron_params['number_frames_step'])

# DataLoader prepares a loader for a set of data including a function that processes every
# batch as we wish (collate_fn). This creates an object with which we can list the batches created.
# DataLoader and Dataset (IMPORTANT FOR FURTHER DESIGNS WITH OTHER DATABASES)
# https://jdhao.github.io/2017/10/23/pytorch-load-data-and-make-batch/
train_sampler = DistributedSampler(train_data) if tacotron_params['distributed_run'] else None
val_sampler = DistributedSampler(validation_data) if tacotron_params['distributed_run'] else None

train_loader = DataLoader(train_data,
def main(): """ Checks arguments for validity and starts the training process according to the specified parameters. """ parser = argparse.ArgumentParser( add_help=True, description="This file trains a new neural network on the given dataset.", ) parser.add_argument( "data_dir", help="data directory containing data for training", action="store", type=check_dir_validity, ) parser.add_argument( "--save_dir", action="store", default="./", dest="save_dir", help="directory to save model checkpoints. Expects full path, e.g. /path/to/dir/ without trailing '/'. By default it is stored in the current directory", type=check_dir_validity, ) parser.add_argument( "--arch", action="store", default="vgg13", dest="arch", help="architecture to use as base for model training. Valid values can be found at https://pytorch.org/docs/stable/torchvision/models.html", ) parser.add_argument( "--learning_rate", dest="learning_rate", type=float, default=0.001, action="store", help="learning rate for the optimizer", ) parser.add_argument( "--hidden_units", dest="hidden_units", type=int, default=512, action="store", help="amount of hidden units to use for classifier", ) parser.add_argument( "--epochs", action="store", dest="epochs", default=1, help="amount of training runs", ) parser.add_argument( "--gpu", action="store_true", default=False, dest="gpu", help="enables training on gpu to increase performance", ) args = parser.parse_args() data_preparation = DataPreparation() data_preparation.prepare_training_data(args.data_dir) model_wrapper = ImageModelWrapper() model_wrapper.init_model( args.arch, int(args.hidden_units), float(args.learning_rate) ) train(model_wrapper, data_preparation, int(args.epochs), args.gpu) model_wrapper.save( args.save_dir, int(args.epochs), data_preparation.class_to_idx )
def main(): """ Checks arguments for validity and starts the inference process according to the specified parameters. """ parser = argparse.ArgumentParser( add_help=True, description= "This file performs inference on the passed image and returns the probabilities for the inferred class.", ) parser.add_argument( "image_path", help="path to image on which inference should be done", type=check_file_existence, ) parser.add_argument( "checkpoint_path", help="path to checkpoint containing the model to be used for inference", type=check_file_existence, ) parser.add_argument( "--top_k", dest="top_k", type=int, default=1, action="store", help="amount of classes to return as result of this application", ) parser.add_argument( "--gpu", action="store_true", default=False, dest="gpu", help="enables inference on gpu to increase performance", ) parser.add_argument( "--category_names", dest="category_names_path", type=check_file_existence, default="cat_to_name.json", action="store", help="path to file containing mapping of categories to real names", ) args = parser.parse_args() data_preparation = DataPreparation() image = data_preparation.transform_image(args.image_path) model_wrapper = ImageModelWrapper() model_wrapper.load(args.checkpoint_path) top_p, class_list = predict( image, model_wrapper, args.gpu, args.category_names_path, int(args.top_k), ) for p, name in zip(top_p, class_list): print("Flower is {} with probability {}%".format(name, p * 100))
            both_predictions = both_predictions + model_predictions

        both_predictions = both_predictions / num_of_models
        both_filename = os.path.join(self.HOME_DIR, "output", "both_test.predict")
        np.savetxt(both_filename, both_predictions, fmt='%1.10f', delimiter="\n")


#=======================================================================================
if __name__ == '__main__':
    dp = DataPreparation()
    dp.build_combination(processing_mode=0, out_filename="data_all_clarity.csv")
    dp.clean_data(target_column="clarity")

    feature_man = FeatureManagement()

    phase = 2
    flags = [False, True]

    if flags[0]:
        features = feature_man.get_basic_features(is_clarity=True) + \
                   feature_man.get_text_features(mode=0, type=0) + \
                   feature_man.get_text_features(mode=0, type=1)
        print("Total number of training features {}".format(len(features)))
from data_preparation import DataPreparation
from crf_brand_detection import CrfBrandDetector

if __name__ == "__main__":
    print('Preparing data...')
    prep_df = DataPreparation().features_labels_prep()

    print('Fitting model...')
    model = CrfBrandDetector()
    x_train, x_test, y_train, y_test = model.train_test_split(prep_df)
    model.fit(x_train, y_train)
    model.print_classification_report(x_test, y_test)
    print('Accuracy for whole titles: {}'.format(model.evaluate(x_test, y_test)))

    pred = model.predict(x_test)
    pred.to_csv('helper_files/predictions.csv')
    fig, axes = plt.subplots(2, figsize=(15, 12), sharex=True)
    axes[0].plot(true, label='true', color='b', alpha=0.75)
    axes[0].set_ylabel('energy demand kwh')
    axes[0].legend(loc='best')
    axes[1].plot(evaluation_loss[1], label='evaluation loss', color='r', alpha=0.75)
    axes[1].plot(evaluation_loss[0] * np.ones(len(true)), '--', label='average_loss',
                 color='r', alpha=0.75)
    axes[1].set_ylabel(loss)
    axes[1].legend(loc='best')
    plt.show()
    pass


if __name__ == '__main__':
    print 'getting and preparing input data...'
    window = 48
    freq = '30T'
    dp = DataPreparation('~/git_hub/capstone_data/Azimuth/clean/project_6d8c_featurized.csv',
                         'energy_all', freq)
    df = dp.read_data()
    # resample_dict = defaultdict(list)
    # resample_dict['sum'] = ['energy_all','liq_precip']
    # resample_dict['mean'] = ['T','irr_glo']
    resample_dict = {'sum': ['energy_all', 'liq_precip'],
                     'mean': ['T', 'irr_glo']}
    df = dp.resample_columns(df, resample_dict)
    df = dp.create_time_features(df)
    df_X, df_y = dp.prepare_data_multistep(df, window)
    X_train, X_test, y_train, y_test = dp.train_test_split(df_X.values, df_y.values, 0.1)

    print 'LSTM model training'
    units = np.arange(30, 111, 20)
    sequences = np.array([i * window for i in [1, 3, 5]])
    dropout = [0.0, 0.2]
    activations = ['relu', 'tanh']
from data_preparation import DataPreparation
from lstm_brand_detection import LstmBrandDetector

if __name__ == "__main__":
    prep = DataPreparation(titles_filepath='helper_files/train.csv')
    print('Loading word embeddings...')
    prep.load_glove('helper_files/glove.txt')
    print('Reading data...')
    prep.read_data()
    print('Preparing data...')
    x, y = prep.prepare_data()
    x_train, x_test, y_train, y_test, test_df = prep.train_test_split(x, y)

    model = LstmBrandDetector()
    print('Fitting model...')
    model.create_model()
    model.fit(x_train, y_train, epochs=8)
    print('Accuracy for whole titles: {}'.format(model.evaluate(x_test, y_test)))

    preds = model.predict(x_test, test_df)
    preds.to_csv('helper_files/predictions.csv')
if target_column == "conciseness": output_filename = os.path.join(self.HOME_DIR, "output", "conciseness_valid.predict") else: output_filename = os.path.join(self.HOME_DIR, "output", "clarity_valid.predict") np.savetxt(output_filename, df_test["predictions_proba"], fmt='%1.10f', delimiter="\n") #======================================================================================= if __name__ == '__main__': dp = DataPreparation() dp.build_combination(processing_mode=1, out_filename="data_all_conciseness.csv") dp.clean_data(target_column="conciseness") feature_man = FeatureManagement() phase = 1 flags = [True, False] if flags[0]: features = feature_man.get_basic_features() + \ feature_man.get_text_features(mode=0, type=0) + \ feature_man.get_text_features(mode=0, type=1) print("Total number of training features {}".format(len(features))) print(feature_man.get_basic_features())
if __name__ == '__main__':
    # MODE:
    # 0 = take a percentage of all data, then split it into train (80%) / test (20%)
    # 1 = split data into train / test first (use the same 20% of ALL data as test set for all training sets)
    mode = 1

    # select dataset sizes (up to 1.0) and algorithms (all options: 'logReg', 'svm', 'dt', 'rf', 'ann')
    selectedSizes = [0.2, 0.4, 0.6]
    selectedAlgorithms = ['svm']

    # set number of repetitions and their respective random generator seeds
    randomSeeds = [20]

    # create new directory for results of this run
    # name of the folder can be passed as param (default name is timestamp)
    dirName = createDir()

    # init dicts to hold data
    dataInfo = {}
    fullTrain = {}
    fullTest = {}
    trainScores = {'Samples': []}  # scores for plotting
    testScores = {'Samples': []}   # scores for plotting

    # clean and preprocess data
    dp = DataPreparation('../data/mainSimulationAccessTraces.csv')
    dp.prepareData()

    # run predictions
    main()
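# The MODE comment above distinguishes two sampling strategies. As an illustration only
# (the project's actual split logic lives elsewhere), the two modes could be expressed
# with scikit-learn's train_test_split roughly like this:
from sklearn.model_selection import train_test_split


def split_for_mode(X, y, size, mode, seed):
    if mode == 0:
        # Mode 0: take `size` fraction of ALL data first, then split that subset 80/20.
        if size < 1.0:
            X, _, y, _ = train_test_split(X, y, train_size=size, random_state=seed)
        return train_test_split(X, y, test_size=0.2, random_state=seed)
    # Mode 1: reserve the same 20% of ALL data as the test set (fixed by `seed`),
    # then subsample only the remaining training portion.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=seed)
    if size < 1.0:
        X_tr, _, y_tr, _ = train_test_split(X_tr, y_tr, train_size=size, random_state=seed)
    return X_tr, X_te, y_tr, y_te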
# Read the training files
with open(training_files, encoding='utf-8') as f:
    training_audiopaths_and_text = [line.strip().split("|") for line in f]
# if tacotron_params['sort_by_length']:
#     training_audiopaths_and_text.sort(key=lambda x: len(x[1]))

# Read the validation files
with open(validation_files, encoding='utf-8') as f:
    validation_audiopaths_and_text = [line.strip().split("|") for line in f]
# if tacotron_params['sort_by_length']:
#     validation_audiopaths_and_text.sort(key=lambda x: len(x[1]))

# prepare the data
# GST adaptation to put prosody features path as an input argument:
train_data = DataPreparation(training_audiopaths_and_text, training_prosody_features_path,
                             tacotron_params)
validation_data = DataPreparation(validation_audiopaths_and_text, validation_prosody_features_path,
                                  tacotron_params)
collate_fn = DataCollate(tacotron_params['number_frames_step'])

# DataLoader prepares a loader for a set of data including a function that processes every
# batch as we wish (collate_fn). This creates an object with which we can list the batches created.
# DataLoader and Dataset (IMPORTANT FOR FURTHER DESIGNS WITH OTHER DATABASES)
# https://jdhao.github.io/2017/10/23/pytorch-load-data-and-make-batch/
train_sampler = DistributedSampler(train_data) if tacotron_params['distributed_run'] else None
val_sampler = DistributedSampler(validation_data) if tacotron_params['distributed_run'] else None

train_loader = DataLoader(train_data, num_workers=1, shuffle=False, sampler=train_sampler,
                          batch_size=tacotron_params['batch_size'], pin_memory=False,
                          drop_last=True, collate_fn=collate_fn)
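# DataCollate is defined elsewhere in this project. As a rough, generic sketch of what a
# collate_fn for variable-length (text, mel) pairs typically does (assumed tensor layout;
# this is NOT the project's DataCollate implementation):
import torch
from torch.nn.utils.rnn import pad_sequence


def pad_text_mel_batch(batch):
    # batch: list of (text_ids [T], mel [n_mels, frames]) pairs of varying lengths
    texts, mels = zip(*batch)
    text_lengths = torch.tensor([t.size(0) for t in texts])
    mel_lengths = torch.tensor([m.size(1) for m in mels])
    padded_text = pad_sequence(list(texts), batch_first=True)  # [B, max_T]
    padded_mel = torch.zeros(len(mels), mels[0].size(0), int(mel_lengths.max()))
    for i, m in enumerate(mels):
        padded_mel[i, :, :m.size(1)] = m                        # [B, n_mels, max_frames]
    return padded_text, text_lengths, padded_mel, mel_lengths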
            if test_prediction == 1:
                xbox_left = int(xleft * scale)
                ytop_draw = int(ytop * scale)
                win_draw = int(window * scale)
                boxes.append(((xbox_left + xstart, ytop_draw + ystart),
                              (xbox_left + win_draw + xstart, ytop_draw + win_draw + ystart)))
    return boxes


if __name__ == '__main__':
    # get attributes of our svc object
    classifier = Classifier.rbf()
    data_prep = DataPreparation.default()

    svc = classifier.svc
    X_scaler = data_prep.scaler
    orient = data_prep.hog_config.orient
    pix_per_cell = data_prep.hog_config.pix_per_cell
    cell_per_block = data_prep.hog_config.cell_per_block
    colorspace = data_prep.hog_config.colorspace
    hog_channels = data_prep.hog_config.hog_channels
    spatial_size = data_prep.hog_config.spatial_size
    hist_bins = data_prep.hog_config.hist_bins

    for file in glob('test_images/vlcsnap-2018-07-13-22h34m59s164.png'):
        img = mpimg.imread(file)

        # search_grid = [
        #     (400, 480, 300, 980, .75),
def main():
    classifier = Classifier.default()
    test_data = DataPreparation.default()
    report = classifier.report(test_data.X_test, test_data.y_test)
    print("Classifier score:", report['score'], "params:", report["params"])