Example No. 1
 def test_punctuation(self):
     config = templates.pipeline()
     config['data_loader'] = templates.data_loader_single_item_loader()
     config['steps'].append(templates.normalize_text_remove_punctuation())
     loader = DataPreprocess(config)
     process = loader.process_item(TEST_DATA)
     self.assertEqual("This isn t a TEST sentences ", process["data"])
Example No. 2
 def test_whitespace(self):
     config = templates.pipeline()
     config['data_loader'] = templates.data_loader_single_item_loader()
     config['steps'].append(templates.normalize_text_remove_whitespace())
     loader = DataPreprocess(config)
     process = loader.process_item(" remove  whitespace ")
     self.assertEqual("remove whitespace", process["data"])
Example No. 3
 def test_lower(self):
     config = templates.pipeline()
     config['data_loader'] = templates.data_loader_single_item_loader()
     config['steps'].append(templates.normalize_text_lowercase())
     loader = DataPreprocess(config)
     process = loader.process_item(TEST_DATA)
     self.assertEqual(TEST_DATA.lower(), process["data"])
Example No. 4
    def test_csv_pipeline(self):
        config = {
            "data_loader": {
                "name": "data_loader",
                "type": "csv",
                "file_path": "test_data/test.csv",
                "columns": {
                    "id": "id",
                    "data": "text",
                    "additional_columns": ["username"]
                },
            },
            "steps": [
                {
                    "name": "normalize_text",
                    "type": "lowercase",
                    "log_level": "INFO"
                },
            ],
        }
        loader = DataPreprocess(config)
        data = []
        for batch in loader.process_data():
            for item in batch:
                data.append(item["data"])

        self.assertEqual(4, len(data))
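For this assertion to hold, test_data/test.csv presumably contains four rows with at least the columns referenced by the data_loader config (id, text, username). A hypothetical fixture of that shape could be generated as follows (the values are invented; only the column names and the row count come from the test):

import os
import pandas as pd

# Hypothetical CSV fixture: column names come from the data_loader config,
# the row count from the assertEqual(4, len(data)) assertion; values are made up.
os.makedirs("test_data", exist_ok=True)
pd.DataFrame({
    "id": [1, 2, 3, 4],
    "text": ["First SAMPLE row", "Second sample ROW", "third Sample row", "FOURTH sample row"],
    "username": ["user_a", "user_b", "user_c", "user_d"],
}).to_csv("test_data/test.csv", index=False)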
Example No. 5
 def test_url(self):
     config = templates.pipeline()
     config['data_loader'] = templates.data_loader_single_item_loader()
     config['steps'].append(templates.normalize_text_remove_urls())
     loader = DataPreprocess(config)
     process = loader.process_item("remove url http://www.google.com")
     self.assertEqual("remove url ", process["data"])
Example No. 6
 def test_stopwords(self):
     config = templates.pipeline()
     config['data_loader'] = templates.data_loader_single_item_loader()
     config['steps'].append(templates.normalize_text_lowercase())
     config['steps'].append(templates.normalize_text_remove_stopwords())
     loader = DataPreprocess(config)
     process = loader.process_item(TEST_DATA)
     self.assertEqual("isn't test sentences!", process["data"])
Example No. 7
 def test_lemmatizer(self):
     config = templates.pipeline()
     config['data_loader'] = templates.data_loader_single_item_loader()
     config['steps'].append(templates.normalize_text_lowercase())
     config['steps'].append(templates.normalize_text_lemmatizer())
     loader = DataPreprocess(config)
     process = loader.process_item("How many cities are there?")
     self.assertEqual("how many city are there?", process["data"])
Example No. 8
 def test_contractions(self):
     config = templates.pipeline()
     config['data_loader'] = templates.data_loader_single_item_loader()
     config['steps'].append(templates.normalize_text_lowercase())
     config['steps'].append(templates.normalize_text_expand_contractions())
     loader = DataPreprocess(config)
     process = loader.process_item(TEST_DATA)
     self.assertEqual("this is not a test sentences!", process["data"])
Example No. 9
    def __init__(self):
        parser = argparse.ArgumentParser(description='Classify the contents of a text file')
        parser.add_argument('--text_file', type=str, help='File path to classify')
        args = parser.parse_args()
        self.text_file_path = args.text_file
        self.df = None
        self.X = None

        self.word_to_vector_model_path = r'models/w2v.pkl'
        self.dim_reduction_path = r'models/dim_reduction.pkl'
        self.model_path = r'models/svc_model.pkl'
        self.data_preprocessing = DataPreprocess()
        self.build_features = BuildFeatures()
Example No. 10
def main():
    ##################################################
    # parse data from original data & construct images
    ##################################################
    print(
        "parsing data from log files which are generated by Atheros-CSI-TOOL\n"
    )
    data_generator = DataLogParser(conf.n_timestamps, conf.D, conf.step_size,
                                   conf.ntx_max, conf.nrx_max,
                                   conf.nsubcarrier_max, conf.data_folder,
                                   conf.log_folder, conf.skip_frames,
                                   conf.time_offset_ratio, conf.day_conf,
                                   conf.label)
    data_generator.generate_image_no_label(conf.draw_date, conf.draw_label)
    # test_data: dict keyed by label; values are the images under that label
    test_data = data_generator.get_data_no_label()

    ##################################################
    # apply signal processing blocks to images
    ##################################################
    print("Pre-processing data\n")
    data_process = DataPreprocess(conf.n_timestamps, conf.D, conf.step_size,
                                  conf.ntx_max, conf.ntx, conf.nrx_max,
                                  conf.nrx, conf.nsubcarrier_max,
                                  conf.nsubcarrier, conf.data_shape_to_nn,
                                  conf.data_folder, conf.label)
    data_process.add_image_no_label(test_data)
    data_process.signal_processing(conf.do_fft, conf.fft_shape)
    data_process.prepare_shape()
    final_test_data = data_process.get_data_no_label()

    ##################################################
    # train or test data with neural network
    ##################################################

    nn_model = NeuralNetworkModel(conf.data_shape_to_nn, conf.abs_shape_to_nn,
                                  conf.phase_shape_to_nn, conf.total_classes)
    print("Get test result using existing model (in test mode)\n")
    nn_model.load_model(conf.model_name)
    for key in final_test_data:
        plt.figure()
        total_test = len(final_test_data[key])
        cc = 1
        for idx in final_test_data[key]:
            # to output the motion probability instead of the predicted label, set output_label=False
            result = nn_model.get_no_label_result(final_test_data[key][idx],
                                                  output_label=True)
            plt.subplot(total_test, 1, cc)
            plt.plot(result)
            plt.title(idx)
            plt.ylim(0, 1.05)
            cc = cc + 1
        plt.suptitle(key)
    nn_model.end()
    plt.show()
    print("Done!")
Example No. 11
 def test_single_item_pipeline(self):
     config = {
         "data_loader": {
             "type": "single_item"
         },
         "steps": [
             {
                 "name": "normalize_text",
                 "type": "lowercase",
                 "log_level": "INFO"
             },
         ],
     }
     loader = DataPreprocess(config)
     process = loader.process_item(TEST_DATA)
     self.assertEqual(TEST_DATA.lower(), process["data"])
Example No. 12
 def generate_evaluate(self, file_path):
     while True:
         for list_x, list_y, list_time_x, list_time_y in self.get_feature(
                 file_path):
             list_x = DataPreprocess.add_feature(
                 file_path,
                 list_time_x) if self.tcn_add_features else list_x
             yield np.array(list_x), np.array(list_y)
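generate_evaluate is an infinite generator yielding (features, targets) batches, so it is presumably consumed by a Keras-style call with an explicit step count. A minimal usage sketch, assuming tcn is an instance of the class above, model is a compiled tf.keras model, and n_batches is the number of batches to draw (all three are assumptions, not part of the example):

# Hypothetical usage of the generator above.
gen = tcn.generate_evaluate(file_path)
results = model.evaluate(gen, steps=n_batches)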
Example No. 13
def execute():
    test_data = prepare_test_data()
    preprocessing = DataPreprocess(test_data, do_load_existing_tokenizer=True)
    prediction = make_prediction(preprocessing)
    roc_auc = evaluate_roc_auc(preprocessing, prediction > 0.5)
    accuracy = evaluate_accuracy_score(preprocessing, prediction > 0.5)

    print(f'Average ROC_AUC Score on Test Data: {roc_auc}')
    print(f'Average Accuracy Score on Test Data: {accuracy}')
Example No. 14
def execute():
    # Import the training data csv file and save it into a dataframe
    training_data = pd.read_csv(TRAINING_DATA_LOC)

    preprocessing = DataPreprocess(training_data)
    rnn_model, history = build_rnn_model(preprocessing.padded_data,
                                         preprocessing.target_classes,
                                         preprocessing.embedding_layer)
    plot_training_history(rnn_model, history, preprocessing.padded_data,
                          preprocessing.target_classes)
Example No. 15
def execute(data):
    training_data = pd.read_csv(data)

    preprocessing = DataPreprocess(training_data)
    lstm_model, history = build_lstm_model(preprocessing.X_t,
                                           preprocessing.target_classes,
                                           preprocessing.embedding_layer)
    plot_training_history(lstm_model, 
                          history,
                          preprocessing.X_t,
                          preprocessing.target_classes)
Example No. 16
    def test_list_pipeline(self):
        config = {
            "data_loader": {
                "type": "list"
            },
            "steps": [
                {
                    "name": "normalize_text",
                    "type": "lowercase",
                    "log_level": "INFO"
                },
            ],
        }
        loader = DataPreprocess(config)
        data = []
        for batch in loader.process_data(TEST_LIST):
            for item in batch:
                data.append(item["data"])

        test = [item.lower() for item in TEST_LIST]
        self.assertEqual(test, data)
Example No. 17
def main():
    logger.info(SEPARATOR)
    configs = get_configs()

    # Run data preprocess
    if configs.data_preprocess_active:
        logger.info(configs)
        preprocess = DataPreprocess(configs)
        preprocess.data_preprocess()
        logger.info("Data preprocess finished!")
    # Run train model
    if configs.da_rnn_model_active:
        logger.info(configs)
        da_rnn_model = DaRnnModel(configs)
        da_rnn_model.run()
        logger.info("Da_rnnModel finished!")
    if configs.xgboost_gridsearch_model_active:
        xgboost_model = XgboostGridSearchModel(configs)
        xgboost_model.run()
        logger.info("XGboost finished!")
    if configs.tcn_big_file_model_active:
        tcn_model = TcnModel(configs)
        tcn_model.run()
        logger.info("TcnModel finished!")
Example No. 18
    def read_data_from_disk(self, queue):  # optional pre-processing arguments

        dataproc = DataPreprocess(queue,
                                  coords=self.coord,
                                  dataset_catgry=self.dataset_catgry,
                                  dataset_type=self.data_type)

        h, w = self.params['input_size']
        dstep = self.params['dce_dstep']

        if self.data_type == 1:

            color_img = dataproc.load_colorimg(height=h, width=w, channels=3)
            depth_gt = dataproc.load_depthgt(height=h, width=w, channels=1)
            subsample_depth = dataproc.load_subsampledepth(height=h,
                                                           width=w,
                                                           channels=1)

            if self.params['orig_normalizefac']:
                depth_gt = tf.cast(depth_gt, tf.float32) * 100 / 256
                subsample_depth = tf.cast(subsample_depth,
                                          tf.float32) * 100 / 256
            else:
                depth_gt = tf.cast(depth_gt, tf.float32)
                subsample_depth = tf.cast(subsample_depth, tf.float32)

            color_img = tf.slice(color_img,
                                 [self.params['truncated_height_start'], 0, 0],
                                 [self.params['truncated_height_end'], w, 3])
            depth_gt = tf.slice(depth_gt,
                                [self.params['truncated_height_start'], 0, 0],
                                [self.params['truncated_height_end'], w, 1])
            subsample_depth = tf.slice(
                subsample_depth, [self.params['truncated_height_start'], 0, 0],
                [self.params['truncated_height_end'], w, 1])

        if self.datainput in ['color_dc_dcclabels']:
            color_img_processed = dataproc.preprocess_color(
                color_img, self.params['coloraugmentflag'])
            depth_gt = tf.cast(depth_gt, tf.float32)

            if self.params['Gen_uniformsampflag']:
                subsample_depth = dataproc.uniform_sampling(
                    depth_gt, self.params['Uniform_samp'])

            subsampledepth_dcc = dataproc.depth_2_dcc_channelsgeneralize(
                subsample_depth,
                dstep,
                self.params['depth_maxrange'],
                spatial_dim=(self.params['truncated_height_end'], w))
            depth_gt_dcc = dataproc.depth_2_dcc_channelsgeneralize(
                depth_gt,
                self.params['dce_dstep'],
                self.params['depth_maxrange'],
                spatial_dim=(self.params['truncated_height_end'], w))
            #depth_gt_dcc = dataproc.depth_2_dcc_channelsgeneralize(depth_gt, dstep, self.params['depth_maxrange'], oorFlag = False, spatial_dim = (h,w))
            depth_gt_dcc = tf.squeeze(depth_gt_dcc)

            data_processed = tf.concat([
                color_img_processed, subsampledepth_dcc, depth_gt_dcc, depth_gt
            ],
                                       axis=2)

        else:
            raise ValueError('Data-Input Type is Unrecognized. Exiting ...\n')

        if self.random_mirror:
            data_processed = data_mirroring(data_processed)

        if self.random_crop:
            data, labels = self.random_crop_and_pad_data_and_labels(
                data_processed, self.params['crop_size'][0],
                self.params['crop_size'][1])
        else:
            data, labels = self.crop_pad_data_labels(
                data_processed, self.params['crop_size'][0],
                self.params['crop_size'][1])

        return data, labels
Example No. 19
 def __init__(self):
     self.model_path = r'models/svc_model.pkl'
     self.data_preprocessing = DataPreprocess()
     self.build_features = BuildFeatures()
     self.X = None
     self.y = None
Example No. 20
class TrainModel:
    def __init__(self):
        self.model_path = r'models/svc_model.pkl'
        self.data_preprocessing = DataPreprocess()
        self.build_features = BuildFeatures()
        self.X = None
        self.y = None

    def run(self):
        # data preprocessing pipeline
        self.data_preprocessing.load_csv()
        self.data_preprocessing.clean_conversation()
        self.data_preprocessing.extract_meaning_phrases()
        self.data_preprocessing.group_convs_by_file_id()
        self.data_preprocessing.rm_dups_phrases_in_same_conv()
        self.X, self.y = self.data_preprocessing.get_X_y()

        # with open('X.pkl', 'rb') as fp:
        #     self.X = pickle.load(fp)
        # self.X = [list(a) for a in self.X]
        #
        # with open('y.pkl', 'rb') as fp:
        #     self.y = pickle.load(fp)

        # Train and test set
        X_train, X_test, y_train, y_test = train_test_split(self.X,
                                                            self.y,
                                                            test_size=0.1,
                                                            stratify=self.y)

        # build features
        # oversampling on training data only
        X_train, y_train = self.build_features.oversampling_on_training_data(
            X_train, y_train)

        # X_train = [' '.join(a).replace('[PAD]', '').strip() for a in X_train]
        # X_test = [' '.join(a).replace('[PAD]', '').strip() for a in X_test]

        # Word to vectors
        self.build_features.word_to_vectors_model(X_train)
        X_train = self.build_features.word_to_vectors_transformed(X_train)
        X_test = self.build_features.word_to_vectors_transformed(X_test)

        # Dimension reduction technique.
        self.build_features.dimension_reduction_model(X_train)
        X_train = self.build_features.dimension_reduction_transformed(X_train)
        X_test = self.build_features.dimension_reduction_transformed(X_test)

        # train model
        model = LinearSVC(random_state=25)
        model.fit(X_train, y_train)
        print('\n\n')
        print('-*-' * 20)
        print('Training accuracy: ', model.score(X_train, y_train) * 100)
        print('Accuracy on unseen documents: ',
              model.score(X_test, y_test) * 100)
        print('-*-' * 20)
        pickle.dump(model, open(self.model_path, 'wb'))  # save
Example No. 21
def main():
    args = get_input_arguments()
    training_mode = (args.mode == 'Y')
    if args.mode not in ['Y', 'N']:
        raise ValueError('Invalid input: mode must be either Y or N')
    data_folder = conf.data_folder
    if training_mode:
        label = conf.train_label
        print('in training mode')
        print('training data from {} \nvalidation data from {}\n'.format(
            conf.training_date, conf.training_validate_date))
        print('training label is {}\n'.format(label))
        data_folder += "training/"
    else:
        label = conf.test_label
        print('in test mode')
        print('test date from {}'.format(conf.test_date))
        print('test label is {}\n'.format(label))
        data_folder += "test/"
    ##################################################
    # parse data from original data & construct images
    ##################################################
    print(
        "parsing data from log files which are generated by Atheros-CSI-TOOL\n"
    )
    data_generator = DataLogParser(conf.n_timestamps, conf.D, conf.step_size,
                                   conf.ntx_max, conf.nrx_max,
                                   conf.nsubcarrier_max, data_folder,
                                   conf.log_folder, conf.skip_frames,
                                   conf.time_offset_ratio, conf.day_conf,
                                   label)
    train_date = conf.training_date if training_mode else []
    if training_mode:
        data_generator.generate_image(conf.training_date,
                                      conf.training_validate_date)
    else:
        data_generator.generate_image([], conf.test_date)
    # train_data, test_data: dicts keyed by label; values are the images under each label
    train_data, test_data = data_generator.get_data()

    ##################################################
    # apply signal processing blocks to images
    ##################################################
    print("Pre-processing data\n")
    data_process = DataPreprocess(conf.n_timestamps, conf.D, conf.step_size,
                                  conf.ntx_max, conf.ntx, conf.nrx_max,
                                  conf.nrx, conf.nsubcarrier_max,
                                  conf.nsubcarrier, conf.data_shape_to_nn,
                                  data_folder, label)
    data_process.load_image(training_mode, False, train_data, test_data)
    data_process.signal_processing(conf.do_fft, conf.fft_shape)
    data_process.prepare_shape()
    x_train, y_train, x_test, y_test = data_process.get_data()
    ##################################################
    # train or test data with neural network
    ##################################################

    nn_model = NeuralNetworkModel(conf.data_shape_to_nn, conf.abs_shape_to_nn,
                                  conf.phase_shape_to_nn, conf.total_classes)
    nn_model.add_data(x_train, y_train, x_test, y_test)
    if training_mode:
        print("Building a new model (in training mode)\n")
        nn_model.cnn_model_abs_phase()
        nn_model.fit_data(conf.epochs)
        nn_model.save_model(conf.model_name)
    else:
        print("Get test result using existing model (in test mode)\n")
        nn_model.load_model(conf.model_name)
        result = nn_model.get_test_result(label)
        # nn_model.save_result(result, conf.file_prefix+conf.test_result_filename)
    nn_model.end()
    print("Done!")
Example No. 22
class Classifier:
    def __init__(self):
        parser = argparse.ArgumentParser(description='Classify the contents of a text file')
        parser.add_argument('--text_file', type=str, help='File path to classify')
        args = parser.parse_args()
        self.text_file_path = args.text_file
        self.df = None
        self.X = None

        self.word_to_vector_model_path = r'models/w2v.pkl'
        self.dim_reduction_path = r'models/dim_reduction.pkl'
        self.model_path = r'models/svc_model.pkl'
        self.data_preprocessing = DataPreprocess()
        self.build_features = BuildFeatures()

    def read_text_file(self):
        with open(self.text_file_path) as fp:
            text = [x.strip('\r\n') for x in fp.readlines()]
            return text

    def create_dataframe(self, text):
        self.df = pd.DataFrame(text, columns=['conversation'])

    def load_model(self):
        return pickle.load(open(self.model_path, 'rb'))


    def run(self):
        text = self.read_text_file()
        self.create_dataframe(text)
        # data preprocessing pipeline
        self.data_preprocessing.test_fill_df(self.df)
        self.data_preprocessing.clean_conversation()
        self.data_preprocessing.extract_meaning_phrases()
        self.data_preprocessing.test_group_convs()
        self.data_preprocessing.rm_dups_phrases_in_same_conv()
        X_test = self.data_preprocessing.test_get_X()
        print(len(X_test))

        # Word to vectors
        X_test = self.build_features.word_to_vectors_transformed(X_test)
        # Dimension reduction technique.
        X_test = self.build_features.dimension_reduction_transformed(X_test)

        model = self.load_model()
        print('-*-' * 20)
        predicted_class = model.predict(X_test)
        print('Result: ', predicted_class)
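Because the constructor parses --text_file from the command line, a typical entry point and invocation would presumably look like the sketch below (the script name is an assumption):

# Hypothetical entry point for the Classifier class above.
if __name__ == '__main__':
    Classifier().run()

# invoked, for example, as:
#   python classifier.py --text_file conversation.txt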