Пример #1
0
 def test_load_txt_data(self):
     """Load each FinancialPhraseBank sentence file through ``extract_dataset``.

     The test makes no assertions on the loaded data; it passes as long as
     none of the four loads raises.
     """
     phrase_bank_files = (
         '../resources/FinancialPhraseBank/Sentences_AllAgree.txt',
         '../resources/FinancialPhraseBank/Sentences_50Agree.txt',
         '../resources/FinancialPhraseBank/Sentences_66Agree.txt',
         '../resources/FinancialPhraseBank/Sentences_75Agree.txt',
     )
     for txt_path in phrase_bank_files:
         # The result is discarded on purpose: only loading is exercised.
         extract_dataset(txt_path)
def preprocessing_oversampling_tdidf(data_path: Text,
                                     preprocessing_text: bool = False,
                                     *,
                                     n_gram_range: tuple = (1, 3),
                                     max_features: int = 100,
                                     random_state: int = 2021):
    """Load a dataset, embed its phrases and oversample the result.

    Pipeline: ``extract_dataset`` -> optional ``data_preprocessing`` ->
    ``tdidf_preprocessing`` on the ``'Phrase'`` column ->
    ``smote_oversampling`` against the ``'Sentiment'`` column.

    Args:
        data_path: Location handed to ``extract_dataset``.
        preprocessing_text: When true, the loaded data is passed through
            ``data_preprocessing`` before embedding.
        n_gram_range: Forwarded unchanged to ``tdidf_preprocessing``.
            Defaults to the previously hard-coded ``(1, 3)``.
        max_features: Forwarded unchanged to ``tdidf_preprocessing``.
            Defaults to the previously hard-coded ``100``.
        random_state: Forwarded unchanged to ``smote_oversampling``.
            Defaults to the previously hard-coded ``2021``.

    Returns:
        A ``(x, y)`` tuple holding the two values produced by
        ``smote_oversampling``.
    """
    data = extract_dataset(data_path)

    if preprocessing_text:
        data = data_preprocessing(data)

    x, y = data['Phrase'], data['Sentiment']

    # The second return value (presumably the fitted vectorizer -- confirm
    # against ``tdidf_preprocessing``) is not needed by this function.
    x_tdidf, _ = tdidf_preprocessing(x,
                                     n_gram_range=n_gram_range,
                                     max_features=max_features)

    # ``toarray()`` is called so the oversampler receives the converted
    # features (assumes ``x_tdidf`` is a sparse matrix -- TODO confirm).
    x_smote, y_smote = smote_oversampling(x_tdidf.toarray(),
                                          y,
                                          random_state=random_state)

    return x_smote, y_smote
def preprocessing_oversampling_tdidf(params):
    """Build a feature/label pair from a dataset according to ``params``.

    Keys read from ``params``:
        ``'data_path'``: passed to ``extract_dataset``.
        ``'preprocessed'``: when falsy (or absent), the loaded data is run
            through ``data_preprocessing``.
        ``'embedding'``: when equal to ``TDIDF_EMBEDDING``, the ``'Phrase'``
            column is embedded with ``tdidf_preprocessing`` using
            ``params['emb_params']['ngram_range']`` and
            ``params['emb_params']['max_features']``; otherwise the
            features are ``None``.
        ``'imbalance'``: when equal to ``SMOTE_IMBALANCE``, the features and
            labels are passed through ``smote_oversampling`` using
            ``params['imb_params']['random_state']`` and
            ``params['imb_params']['k_neighbors']``.

    Returns:
        A ``(x_data, y_data)`` tuple: the oversampled pair when SMOTE is
        selected, otherwise the (possibly ``None``) features together with
        the ``'Sentiment'`` column.
    """
    source_path = params.get('data_path')
    already_clean = params.get('preprocessed')
    embedding_kind = params.get('embedding')
    balancing = params.get('imbalance')

    frame = extract_dataset(source_path)
    if not already_clean:
        frame = data_preprocessing(frame)

    phrases = frame['Phrase']
    labels = frame['Sentiment']

    # Embedding step: features stay None for any unrecognised embedding.
    features = None
    if embedding_kind == TDIDF_EMBEDDING:
        emb_cfg = params['emb_params']
        embedded, _ = tdidf_preprocessing(phrases,
                                          n_gram_range=emb_cfg['ngram_range'],
                                          max_features=emb_cfg['max_features'])
        features = embedded.toarray()

    # Imbalance step: without SMOTE the data is returned as-is.
    if balancing != SMOTE_IMBALANCE:
        return features, labels

    imb_cfg = params['imb_params']
    x_balanced, y_balanced = smote_oversampling(features,
                                                labels,
                                                random_state=imb_cfg['random_state'],
                                                k_neighbors=imb_cfg['k_neighbors'])
    return x_balanced, y_balanced
 def load_data(self, filepath):
     """Return whatever ``extract_dataset`` yields for ``filepath``."""
     dataset = extract_dataset(filepath)
     return dataset
Пример #5
0
 def test_preprocessing(self):
     """Load the kaggle training TSV and run ``data_preprocessing`` on it.

     No assertions are made; the preprocessed data is returned to the
     caller.
     """
     raw = extract_dataset('../resources/kaggle/train.tsv')
     # NOTE: a ``save_dir=PREPROCESSED_DATA_DIR`` keyword was commented out
     # of this call in the original and remains disabled here.
     cleaned = data_preprocessing(raw)
     return cleaned