def train_and_test_separate_files(train_path, test_path):
    """
    Trains the model on the training data found at train_path and evaluates it
    on the test data found at test_path.

    :param train_path: Path to the training data, located in the "input" folder.
    :type train_path: str
    :param test_path: Path to the test data, also located in the "input" folder.
    :type test_path: str
    :return: None. The accuracy of the model is printed instead.
    :rtype: None
    """
    tr_reviews, tr_labels = Preprocessing().get_data(train_path)
    X_train, y_train = np.array(tr_reviews), np.array(tr_labels)
    model, sf, tr_vecs, imp = generate_model(X_train, y_train)

    test_reviews, test_labels = Preprocessing().get_data(test_path)
    X_test, y_test = np.array(test_reviews), np.array(test_labels)
    y_pred = test_model(model, sf, tr_vecs, imp, X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}%'.format(accuracy * 100))

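# A minimal usage sketch for train_and_test_separate_files above; the CSV file names
# under the "input" folder are placeholders, not taken from the original project.
if __name__ == '__main__':
    train_and_test_separate_files('input/train.csv', 'input/test.csv')
    # prints the test-set accuracy as a percentage
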
def product_scenario(self):
    _config_before = config_train('before', 'PALSAR')
    _log_file = _config_before.LOG_DIR + '/' + _config_before.LOG_FILENAME
    self.__logger = Logger('LandslideWatcher', _config_before.LOG_OUTPUT, _log_file,
                           _config_before.LOG_LEVEL, _config_before.LOG_FORMAT)

    _preprc_before = Preprocessing(_config_before, self.__logger)
    _preprc_before.run()

    _config_after = config_train('after', 'PALSAR')
    _preprc_after = Preprocessing(_config_after, self.__logger)
    _preprc_after.run()

def model_training(self):
    '''
    Train the model
    '''
    pre = Preprocessing()
    print('Loading data')
    df = self.data.read_data(etapa_treino=True)

    print('Training preprocessing')
    # Dataset split and processed
    X, y, features = pre.process(df, etapa_treino=True)

    # Standardize with a scaler (note: the scaled values are not used by the fit below)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(X, y)

    # Create models
    linear_regression_model = linear_model.LinearRegression()
    rf = RandomForestRegressor()

    # Train the random forest on the training data
    model = rf.fit(X, y)

    return model, features

def create_dictionary(in_file=None, passage=None):
    if in_file is not None and os.path.exists(in_file):
        with open(in_file, 'r', encoding='utf-8') as f:
            corpus_lines = f.read().split("\n")
    elif passage is not None:
        corpus_lines = passage.split("\n")
    else:
        print("Invalid input!")
        return

    d = Dictionary()
    p = Preprocessing()
    if d.database_exists(d.DB_DICTIONARY):
        return False

    for line in corpus_lines:
        words = p.fetch_line_words(line)
        for word in words:
            main_word = re.sub(r"[^-A-Za-z0-9]", '', word[0])
            root = p.fetch_lemmatized_word(main_word, word[1])
            d.prepare_word2dic(main_word, root)

    return d.store_prepared_data()

def create_bigram(in_file=None, passage=None):
    if in_file is not None and os.path.exists(in_file):
        with open(in_file, 'r', encoding='utf-8') as f:
            corpus_lines = f.read().split("\n")
    elif passage is not None:
        corpus_lines = passage.split("\n")
    else:
        print("Invalid input!")
        return

    d = Dictionary()
    p = Preprocessing()
    if d.database_exists(d.DB_BIGRAM):
        return False

    for line in corpus_lines:
        words = p.fetch_line_words(line, escape_symbols=False)
        prev_word = (None, None)
        for word in words:
            d.prepare_bigram2dic(word, prev_word)
            prev_word = word

    return d.store_prepared_data()

def main():
    """
    Main function to extract features.
    """
    args = parse_args()
    cfg = load_config(args)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    th.manual_seed(cfg.RNG_SEED)

    failed_log = open(args.csv.split(".csv")[0] + "_failed.txt", "w")

    assert args.target_framerate % args.min_num_features == 0

    preprocess = Preprocessing(
        "3d",
        cfg,
        target_fps=args.target_framerate,
        size=224,
        clip_len=args.clip_len,
        padding_mode='tile',
        min_num_clips=args.min_num_features)

    if args.dataflow:
        readvideo = ReadVideo(
            preprocess,
            framerate=args.target_framerate,
            size=224,
            centercrop=True,
            pix_fmt=args.pix_fmt,
            overwrite=args.overwrite)
        dataset = VideoDataFlow(args.csv)
        # dataset = MultiThreadMapData(
        #     dataset, num_thread=args.num_decoding_thread,
        #     map_func=readvideo,
        #     buffer_size=1000)
        # loader = MultiProcessRunnerZMQ(
        #     dataset, num_proc=1)
        loader = MultiProcessMapDataZMQ(
            dataset,
            num_proc=args.num_decoding_thread,
            map_func=readvideo,
            strict=True)
        loader.reset_state()
        n_dataset = len(dataset)
    else:
        dataset = VideoLoader(
            args.csv,
            preprocess,
            framerate=args.target_framerate,
            size=224,
            centercrop=True,
            pix_fmt=args.pix_fmt,
            overwrite=args.overwrite)
        n_dataset = len(dataset)
        sampler = RandomSequenceSampler(n_dataset, 10)
        loader = DataLoader(
            dataset,
            batch_size=1,
            shuffle=False,
            num_workers=args.num_decoding_thread,
            sampler=sampler if n_dataset > 10 else None)

    model = build_model(cfg)
    perform_test(loader, model, preprocess, cfg, args, failed_log, n_dataset)

def __init__(self, train_features_dir, val_features_dir, test_audios_dir, num_epochs,
             train_batch_size, val_batch_size, learning_rate, base_dir, max_to_keep,
             model_name):  # test_features_dir, test_batch_size
    self.data_loader = DataLoader(train_features_dir, val_features_dir,
                                  train_batch_size, val_batch_size)
    self.test_audios_dir = test_audios_dir
    self.train_batch_size = train_batch_size
    self.val_batch_size = val_batch_size
    self.learning_rate = learning_rate
    self.num_epochs = num_epochs
    self.base_dir = base_dir
    self.model_name = model_name
    self.max_to_keep = max_to_keep

    self.checkpoint_dir = os.path.join(self.base_dir, self.model_name, "checkpoints")
    if not os.path.exists(self.checkpoint_dir):
        os.makedirs(self.checkpoint_dir)

    self.summary_dir = os.path.join(self.base_dir, self.model_name, "summaries")
    if not os.path.exists(self.summary_dir):
        os.makedirs(self.summary_dir)

    prep = Preprocessing()
    self.sample_rate = prep.sample_rate
    self.frame_dim = int(prep.frame_len / 2) + 1
    self.frame_count = prep.frame_count
    self.preprocess_test_data = prep.preprocess_test_data
    self.preprocess_test_data_unet = prep.preprocess_test_data_unet
    self.produce_time_outputs = prep.produce_time_outputs
    self.time_outputs_into_track = prep.time_outputs_into_track
    self.produce_time_outputs_unet = prep.produce_time_outputs_unet
    self.time_outputs_into_track_unet = prep.time_outputs_into_track_unet

def main():
    image_shape = (128, 128, 3)
    datadir = "/home/awolfert/projects/engagement-l2tor/data/emotions/"

    prep = Preprocessing(datadir, "x_train3.txt", "x_test3.txt", "x_val3.txt",
                         "y_train3.txt", "y_test3.txt", "y_val3.txt")
    x_train, y_train, x_val, y_val = prep.getTrainData(trim=True, img_shape=image_shape)

    train(x_train, y_train, x_val, y_val, image_shape)

def fit_model(self, df):
    start = time()
    logger.debug('Fitting model to data')

    # Preprocessing the dataset (cleaning, reformatting)
    pp = Preprocessing()
    self.status = "preprocessing : cleaning, reformatting"
    pp.fit(df)
    train = pp.transform(df)
    logger.debug('DataFrame shape : %s, %s' % (train.shape[0], train.shape[1]))
    logger.debug('Target distribution: %s' % (train['product_rating'].value_counts()))

    # Tuning model's hyperparameters with a Bayesian Optimizer
    self.status = "tuning model's hyperparameters"
    params = self.model_tuning(train)

    # Fitting data to our model with updated parameters
    self.status = "fitting data to the model"
    xgbclf = xgb.XGBClassifier(**params)
    xgbclf.fit(train.drop('target', 1), train.target)
    logger.debug('Time elapsed : %0.3fs' % (time() - start))

    # Saving model
    joblib.dump(xgbclf, folder_name + "xgb.model")
    joblib.dump(pp, folder_name + 'preprocessing.model')
    logger.debug('Model for XGBoost & preprocessing data saved.')

def run_experiment(self):
    '''
    Run the specified experiments

    :return: Dict with metrics
    '''
    pre = Preprocessing()

    print('Reading Data')
    train_df = DataSource().read_data(etapa_treino=True)
    test_df, y_test = DataSource().read_data(etapa_treino=False)
    y_test = y_test['SalePrice']

    print('Preprocessing Data')
    X_train, y_train = pre.process(train_df, etapa_treino=True)

    print('Processing Test Data')
    X_test = pre.process(test_df[pre.train_features], etapa_treino=False)

    print('Training Model')
    models = Experiments().train_model(X_train, y_train)

    print('Running Metrics')
    for model in models.keys():
        print(model)
        y_pred = models[model].predict(X_test)
        print(Metrics().calculate_regression(y_test, pd.Series(y_pred)))
        metrics = Metrics().calculate_regression(y_test, pd.Series(y_pred))
        pd.DataFrame.from_dict(
            metrics, orient='index').to_csv('../output/' + model + '.csv')

    return metrics

def run_parallel(olo, cv, cmplt):
    random.seed(SEED_NUMBER)
    makedirs(REPORT_PATH, exist_ok=True)
    makedirs(join(REPORT_PATH, DATA_HEADER), exist_ok=True)

    data = Preprocessing()
    data.main(olo, cv, cmplt)

    if data.data_train_folds:
        n_jobs = len(data.data_train_folds)
    else:
        n_jobs = AVG_COUNT

    start = time.time()
    arg_instances = [[idx, data] for idx in range(n_jobs)]
    results = Parallel(n_jobs=n_jobs, verbose=1, backend="multiprocessing")(
        map(delayed(handle_model), arg_instances))
    end = time.time()
    print('multi-threading time = {:.3f}'.format((end - start) / 60))

    ml_performance = [result[0] for result in results]
    class_precision = [result[1] for result in results]
    track_to_plot = [result[2] for result in results]

    avg_perf = avg_performance(ml_performance)
    print('Average ML performance:')
    for metric, val in avg_perf.items():
        print(metric + ' ' + str('%.5f' % val))

    avg_precision = avg_performance(class_precision)
    plot_bar(avg_precision, 'precision')
    plot_records(track_to_plot)

def classifyWithModel(self, model, sentence, preprocess=None):
    if model is not None:
        if preprocess is None:
            preprocess = Preprocessing()
        sentence = preprocess.process(sentence)
        sentence_split = sentence.split(" ")

        clas = {}
        for c in model['clas']:
            vj = model['prior'][c]
            for cc in sentence_split:
                if cc in model['cond_prob'][c]:
                    vj *= model['cond_prob'][c][cc]
            clas[c] = vj

        prev = 0
        curr = 0
        argmax = ''
        for c in model['clas']:
            curr = clas[c]
            if curr > prev:
                argmax = c
                prev = curr

        print("Test data : ", sentence)
        print('Class : ', argmax)
        return argmax
    else:
        print("No model!")
        return False

def preprocessingText(self, doPreprocessing, progress, qc):
    if self.con is not None:
        if self.training_table:
            self.dataTraining = self.con.getDataAsDF(self.training_table)
            progress.setValue(1)
            if self.dataTraining is not None:
                p = Preprocessing(con=self.con)
                progressP = 1
                progressS = (99 - progressP) / len(self.dataTraining.index)
                for index, row in self.dataTraining.iterrows():
                    text = row[self.text_col]
                    if doPreprocessing:
                        pretext = p.process(text)
                        pretext = pretext['stemmed_text']
                    else:
                        pretext = p.processNoPre(text)
                    self.dataTraining.at[index, self.text_col] = pretext
                    progressP += progressS
                    progress.setValue(progressP)
                    qc.processEvents()
                qc.processEvents()
                progress.setValue(99)
        else:
            print("No training table!")
    progress.setValue(100)

def feature_size_acc(dataset, model, **kwargs):
    fractions = [0.2, 0.4, 0.6, 0.8, 1]
    accuracies = []

    for frac in fractions:
        if dataset == Dataset.imdb_reviews:
            dataset_features = imdb_feature_n
        elif dataset == Dataset.twenty_news:
            dataset_features = twenty_features_n
        num_features = int(frac * dataset_features)

        preprocessor = Preprocessing(dataset, max_features=num_features)
        set_ = preprocessor.get_train_test()
        acc = evaluate_model(model, dataset, set_, verbose=False, show_plot=False, **kwargs)
        accuracies.append(acc)

    df = pd.DataFrame(columns=["Fraction of features", "Accuracy"])
    df["Fraction of features"] = fractions
    df["Accuracy"] = accuracies
    df.plot.line(x="Fraction of features", y="Accuracy")
    plt.title("training set feature size vs accuracy {}-{}".format(dataset, model))
    plt.show()

def classify(self, sentence, preprocess=None):
    if self.model is not None:
        if preprocess is None:
            preprocess = Preprocessing()
        sentence = preprocess.process(sentence)
        sentence_split = sentence['stemmed_text'].split(" ")

        clas = {}
        for c in self.model['clas']:
            vj = self.model['prior'][c]
            for cc in sentence_split:
                if cc in self.model['cond_prob'][c]:
                    vj *= self.model['cond_prob'][c][cc]
            clas[c] = vj

        prev = 0
        curr = 0
        argmax = ''
        for c in self.model['clas']:
            curr = clas[c]
            if curr > prev:
                argmax = c
                prev = curr

        print("Test data : ", sentence)
        print('Class : ', argmax)
        return argmax
    else:
        print("No model!")
        return False

def do(config):
    # Read data & preprocess
    print("Read data")
    ds = Datasets(config.data_path)
    data = ds.read_data()

    print("Data preprocessing..")
    preprocessing = Preprocessing(config)
    X = preprocessing.do(data)

    print('Train model')
    if config.sg == 'CBOW':
        model = Word2Vec(
            sentences=X,
            size=config.size,
            window=config.window,
            min_count=config.min_count,
            workers=config.workers,
            sg=0
        )
    else:
        model = Word2Vec(
            sentences=X,
            size=config.size,
            window=config.window,
            min_count=config.min_count,
            workers=config.workers,
            sg=1
        )

    print(model.wv.vectors.shape)
    model.save(os.path.join(config.save_directory, config.ckpt_name))

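# A hedged usage sketch: loading the Word2Vec checkpoint saved by do() above and querying
# it with gensim. The checkpoint path and the query token are placeholders, not values
# from the original config.
from gensim.models import Word2Vec

w2v = Word2Vec.load('checkpoints/word2vec.model')  # i.e. os.path.join(save_directory, ckpt_name)
print(w2v.wv.most_similar('example', topn=5))      # nearest neighbours of a query token
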
def __init__(self, args):
    '''
    Run a DDQN training session, or test its result, with the donkey simulator
    '''
    self.args = args
    self = init_simulator(self)

    # Construct gym environment. Starts the simulator if path is given.
    self.env = gym.make(self.args.env_name, conf=self.conf)
    self.memory = deque(maxlen=10000)

    # Get size of state and action from environment
    self.state_size = (img_rows, img_cols, img_channels)
    self.action_space = self.env.action_space  # Steering and Throttle

    self.agent = DQNAgent(self.state_size,
                          self.action_space,
                          input_shape=(img_rows, img_cols, img_channels),
                          output_size=turn_bins,
                          train=not args.test)
    self.preprocessing = Preprocessing()

    if os.path.exists(args.model):
        print("load the saved model")
        self.agent.load_model(args.model)

    try:
        self.run_ddqn()
    except KeyboardInterrupt:
        print("stopping run...")
    finally:
        self.env.unwrapped.close()

def main():
    print(Welcome.WELCOME)

    # Init visualization
    v = Visualization()

    # Ask which data to load
    data_file = input(Data.Q_DATA_2_LOAD)

    # Save the data file to the preprocessing class
    pp = Preprocessing(data_file)

    # Ask if the user wants to see the raw data
    v.show_raw_information(pp.raw)

    # Ask if the user wants to decrease the time channels of the raw data
    pp.decrease_time_channels()

    # Ask and apply a notch filter if required
    if pp.notch_filter() == Notch_filter.APPLY_NOTCH_FILTER:
        # If the user has applied the filter, we ask if they want to see the results
        v.plot_data(pp.raw)

    # Ask and apply a bandpass filter if required
    bandpass_filter(pp)

    # Ask and apply an ICA filter if required
    if pp.ica_filter(v) == ICA_filter.APPLY_ICA_FILTER:
        print("TODO: I think this should be removed")

def make_submission():
    INV_CLASS = {
        0: 'Black-grass',
        1: 'Charlock',
        2: 'Cleavers',
        3: 'Common Chickweed',
        4: 'Common wheat',
        5: 'Fat Hen',
        6: 'Loose Silky-bent',
        7: 'Maize',
        8: 'Scentless Mayweed',
        9: 'Shepherds Purse',
        10: 'Small-flowered Cranesbill',
        11: 'Sugar beet'
    }

    preprocessing = Preprocessing()
    model = CNN_NET(4, 12, 0).to('cuda')

    predict_data = preprocessing.test_data_read()
    predict_input = torch.from_numpy(predict_data['image'])
    predict_dataset = torch.utils.data.TensorDataset(predict_input)
    predict_set = torch.utils.data.DataLoader(predict_dataset, batch_size=32)

    model.load_state_dict(torch.load('./model.pth'))
    # run inference with the project's `predict` helper (assumed to be imported elsewhere)
    prediction = predict(model, 'cuda', predict_set)
    predict_data['label'] = prediction

    with open('submission.csv', 'w', encoding='utf-8') as f:
        f.write('file,species' + '\n')
        for i in range(len(predict_data['id'])):
            f.write(predict_data['id'][i] + ',' + INV_CLASS[prediction[i]] + '\n')

def fine_tune(epochs=20, dropout=0.25, lr=1e-5):
    model = CNN_NET(4, 12, dropout).to('cuda')
    preprocessing = Preprocessing()

    data = preprocessing.read_image()
    data['image'] = torch.from_numpy(data['image'])
    data['label'] = torch.from_numpy(data['label'])
    train_set = torch.utils.data.TensorDataset(data['image'], data['label'])

    # Shuffle indices and split into a held-out test set and a training set
    indices = np.random.permutation(len(train_set))
    test_sequence = torch.from_numpy(indices)
    test_set = torch.utils.data.Subset(train_set, test_sequence[0:400])
    train_set_split = torch.utils.data.Subset(train_set, test_sequence[400:])

    train_loader = torch.utils.data.DataLoader(dataset=train_set_split, batch_size=64,
                                               shuffle=True, num_workers=4)
    test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32)

    print('Total length: ', len(train_set))
    print('train_length:', len(train_set_split))
    print('test_length:', len(test_set))

    try:
        model.load_state_dict(torch.load('./model.pth'))
    except FileNotFoundError:
        print('Initialization')

    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        train(model, 'cuda', train_loader, optimizer, epoch=epoch)
        test(model, 'cuda', test_loader)

    torch.save(model.state_dict(), './model.pth')

def get_data():
    '''
    Returns np array of labelled data
    '''
    labels = []
    x = []
    test = []
    paths = []
    pp = Preprocessing()

    # get list of data files
    flare_list = pp.get_training_data(pp.flare_path)
    blurry_list = pp.get_training_data(pp.blurry_path)
    good_list = pp.get_training_data(pp.good_path)

    paths = [pp.flare_path] * len(flare_list)
    paths = paths + [pp.blurry_path] * len(blurry_list)
    paths = paths + [pp.good_path] * len(good_list)
    test = flare_list + blurry_list + good_list

    # label flare files as 1 and add to training set
    append_sets(flare_list, pp.flare_path, x, labels, 1)
    # label blurry files as 2 and add to training set
    append_sets(blurry_list, pp.blurry_path, x, labels, 2)
    # label good files as 0 and add to training set
    append_sets(good_list, pp.good_path, x, labels, 0)

    return np.float32(x), np.array(labels, dtype=np.int32), test, paths

def model_training(self):
    '''
    Train the model.

    :return: Dict with trained model, preprocessing used and columns used in training
    '''
    # When the model is trained, the "Preprocessing" class is called
    pre = Preprocessing()

    print('Loading data')
    # Read the data for the training stage
    df = self.data.read_data(etapa_treino=True)

    print('Training preprocessing')
    # Preprocess the training data ("X_train" and "y_train");
    # the training data comes together with the label
    X_train, y_train = pre.process(df, etapa_treino=True)

    print('Training Model')
    # Instantiate a linear regression
    model_obj = LinearRegression()
    # Fit the model to the data (so the model starts learning from the data)
    model_obj.fit(X_train, y_train)

    # Return value of "model_training" (the function returns a dictionary)
    model = {'model_obj': model_obj,        # trained model object
             'preprocessing': pre,          # preprocessing object used
             'colunas': pre.feature_names}  # names of the features the model was trained with
    print(model)

    # Save the model output
    dump(model, '../output/modelo.pkl')

    return model

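# A hedged usage sketch for the artefact saved by model_training above. The joblib-style
# `load` and the new DataFrame `df_new` (with the raw columns expected by the preprocessing
# step) are assumptions, not part of the original project.
from joblib import load

artifacts = load('../output/modelo.pkl')
X_new = artifacts['preprocessing'].process(df_new, etapa_treino=False)
y_pred = artifacts['model_obj'].predict(X_new)
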
def __init__(self, lang_code, method="LSA", n_words=200, k=1, sv_threshold=0.5,
             min_df=0, max_df=.1, use_idf=True):
    self.lang_code = lang_code
    self.method = method
    self.n_words = n_words
    self.k = k  # num topics
    self.sv_threshold = sv_threshold
    self.min_df = min_df
    self.max_df = max_df
    self.use_idf = use_idf

    self.valid_langs = ["en"]
    if self.lang_code in self.valid_langs:
        self.p = Preprocessing(lang_code=lang_code)
        self.tfidf = TfidfVectorizer(min_df=min_df, max_df=max_df, use_idf=use_idf)

def test_extract_bigrams(self):
    expected = ["football_player", "goal_goal", "cup_league", "result_result"]

    p = Preprocessing(lang_code='en')
    bigrams = p.extract_bigrams(self.token_list)

    self.assertTrue(isinstance(bigrams, list))
    self.assertEqual(bigrams, expected)

def main():
    preprocessing = Preprocessing(Config.train_poems_location)
    preprocessing.preprocess()

    model = PoemModel(preprocessed=preprocessing,
                      weight_file=Config.weight_file,
                      window_size=Config.window_size,
                      learning_rate=0.001,
                      batch_size=32)

def test_keep_valid_tokens(self):
    expected = ["result", "goal", "game", "cup", "football_player", "cup_league"]

    p = Preprocessing(lang_code=self.lang_code)
    valid_tokens = p.keep_valid_tokens(self.token_list)

    self.assertNotIn('goal_goal', valid_tokens)
    self.assertNotIn('result_result', valid_tokens)
    self.assertTrue(isinstance(valid_tokens, list))
    self.assertEqual(valid_tokens, expected)

def save_stock(self, stock):
    with open("dataset/" + stock + ".csv", 'w+') as file:
        file.write("date,open,max,min,close,volume,y\n")
        prep = Preprocessing()
        close = np.array(self.data[stock])[:, 4]
        y = prep.create_train_result(close)
        for index in range(len(y)):
            x = ",".join([str(i) for i in self.data[stock][index]])
            file.write(x + "," + str(y[index]) + "\n")

def main(args):
    pp = Preprocessing()

    # load data
    print("Loading Data.....\n\n")
    train_block, train_block_label = pp.read_train_file(args.train_data, args.train_label)
    test_block = pp.read_test_file(args.test_data)

    # explore data, do some visualization
    print("Exploring Data (see 'fig' folder for visualization) .....\n\n")
    viz = Visualization()
    # histogram of the LPC coefficient distribution
    viz.visualize_lpc_distribution(train_block)
    # histogram of the block length (or point of time) distribution
    viz.visualize_block_length_distribution(train_block)
    # plot one block of LPC coefficients per speaker to look at the pattern of voice frequency
    viz.visualize_lpc_time_series(train_block)
    viz.visualize_fitted_lpc_series(train_block)

    max_length = 29
    final_block_size = 18

    print("Data Preprocessing (padding to fixed size blocks)....\n\n")
    # Take the best length (18), truncate longer blocks, and pad shorter blocks with their last row
    train_data = pp.pad_to_fixed_size_blocks(train_block, max_length, final_block_size)
    test_data = pp.pad_to_fixed_size_blocks(test_block, max_length, final_block_size)

    # dummy test labels for convenience
    test_block_label = [[i] for i in np.zeros(len(test_data))]

    print("Generating Features (for ML Algorithms)... \n\n")
    # Generate fixed length feature vectors for traditional machine learning input
    final_train_data = pp.convert_to_vectors(train_data, train_block_label, final_block_size)
    final_test_data = pp.convert_to_vectors(test_data, test_block_label, final_block_size)

    # See scatter plot to find out if there is grouping based on the feature vectors
    viz.lpc_scatter_plot(final_train_data)

    # Looks like there is a grouping, so let's try to classify using some popular algorithms
    model = Models()
    model.run_classification_models(final_train_data, final_test_data)
    print("SVM Prediction Saved (see 'results/submission.txt' )... \n\n")

    # Also try LSTM for classification
    model.run_LSTM_model(np.array(train_data), np.array(train_block_label), np.array(test_data))
    print("LSTM Prediction Saved (see 'results/submission_lstm.txt' )... \n\n")

def preprocessing(self, doPreprocessing, doFeatureSelection, take_feature, threshold, progress, qc):
    features = None
    if self.con is not None:
        if self.training_table:
            # self.dataTraining = self.con.getDataAsDF(self.training_table)
            progress.setValue(10)
            if self.dataTraining is not None:
                p = Preprocessing(con=self.con)
                oritext = None
                uniqFeature = []
                features = {}
                originalFeatureCount = 0
                progressP = 10
                progressS = (70 - progressP) / len(self.dataTraining.index)
                for index, row in self.dataTraining.iterrows():
                    text = row[self.text_col]
                    if doPreprocessing:
                        pretext = p.process(text)
                        oritext = pretext['oritext']
                        pretext = pretext['stemmed_text']
                    else:
                        pretext = p.processNoPre(text)
                    t = p.processNoPre(pretext).split(" ")  # bad performance
                    uniqFeature.extend(t)  # bad performance
                    # print("Ori : ", text)
                    # print("Preprocessed : ", pretext, " -> ", row[self.class_col])
                    self.dataTraining.at[index, self.text_col] = pretext
                    progressP += progressS
                    progress.setValue(progressP)
                    # time.sleep(0.5)
                    qc.processEvents()
                progress.setValue(70)
                qc.processEvents()
                uniqFeature = set(uniqFeature)  # bad performance
                qc.processEvents()
                features['featurebefore'] = len(uniqFeature)  # bad performance
                qc.processEvents()
                progress.setValue(80)
                features['vsm'] = self.builtVSM(doFeatureSelection, take_feature,
                                                threshold, qc=qc)
                features['oritext'] = oritext
                progress.setValue(90)
        else:
            print("No training table!")
    progress.setValue(100)
    return features

def buttonClick(self):
    filename = askopenfilename()
    self.filename = filename
    print(filename)
    # do here
    self.TLabel1.configure(text=filename)
    self.TLabel1['text'] = filename

    splittingData = Preprocessing(filename)
    self.x_train, self.y_train, self.x_test, self.y_test = splittingData.Split()