def test_load_txt_data(self):
    """Smoke-test that every FinancialPhraseBank sentence file can be loaded.

    Each file is passed to ``extract_dataset`` in turn. Nothing is asserted
    about the loaded content, so the test passes as long as no call raises.
    """
    phrase_bank_files = (
        '../resources/FinancialPhraseBank/Sentences_AllAgree.txt',
        '../resources/FinancialPhraseBank/Sentences_50Agree.txt',
        '../resources/FinancialPhraseBank/Sentences_66Agree.txt',
        '../resources/FinancialPhraseBank/Sentences_75Agree.txt',
    )
    for txt_path in phrase_bank_files:
        # The result is intentionally discarded; only loading is exercised.
        extract_dataset(txt_path)
def preprocessing_oversampling_tdidf(data_path: Text,
                                     preprocessing_text: bool = False):
    """Load a dataset, vectorise its phrases and oversample the result.

    Steps, in order:
      1. ``extract_dataset(data_path)`` loads the data.
      2. If ``preprocessing_text`` is true, the data goes through
         ``data_preprocessing``.
      3. The ``'Phrase'`` column is vectorised by ``tdidf_preprocessing``
         with n-gram range ``(1, 3)`` and at most 100 features.
      4. The dense feature array and the ``'Sentiment'`` column are passed
         to ``smote_oversampling`` with ``random_state=2021``.

    Args:
        data_path: Location of the dataset handed to ``extract_dataset``.
        preprocessing_text: Whether to run ``data_preprocessing`` first.

    Returns:
        A ``(x, y)`` tuple holding the two values produced by
        ``smote_oversampling``.
    """
    frame = extract_dataset(data_path)
    if preprocessing_text:
        frame = data_preprocessing(frame)

    phrases = frame['Phrase']
    sentiments = frame['Sentiment']

    # The second return value of tdidf_preprocessing is not needed here.
    phrase_matrix, _ = tdidf_preprocessing(phrases,
                                           n_gram_range=(1, 3),
                                           max_features=100)
    # NOTE(review): presumably a sparse matrix; `.toarray()` densifies it
    # before oversampling -- confirm against tdidf_preprocessing.
    dense_features = phrase_matrix.toarray()

    x_balanced, y_balanced = smote_oversampling(dense_features,
                                                sentiments,
                                                random_state=2021)
    return x_balanced, y_balanced
def preprocessing_oversampling_tdidf(params):
    """Run the load / preprocess / embed / rebalance pipeline from a config.

    Keys read from ``params``:
      * ``'data_path'``    -- passed to ``extract_dataset``.
      * ``'preprocessed'`` -- when falsy (or missing), the loaded data is
        passed through ``data_preprocessing``.
      * ``'embedding'``    -- when equal to ``TDIDF_EMBEDDING``, the
        ``'Phrase'`` column is vectorised with ``tdidf_preprocessing`` using
        ``params['emb_params']['ngram_range']`` and
        ``params['emb_params']['max_features']``; otherwise the features
        are ``None``.
      * ``'imbalance'``    -- when equal to ``SMOTE_IMBALANCE``, features and
        labels are passed to ``smote_oversampling`` using
        ``params['imb_params']['random_state']`` and
        ``params['imb_params']['k_neighbors']``; otherwise they are returned
        unchanged.

    Args:
        params: Mapping holding the configuration keys listed above.

    Returns:
        A ``(x_data, y_data)`` tuple of features and labels.
    """
    source_path = params.get('data_path')
    already_clean = params.get('preprocessed')
    embedding_kind = params.get('embedding')
    balancing_kind = params.get('imbalance')

    frame = extract_dataset(source_path)
    if not already_clean:
        frame = data_preprocessing(frame)

    phrases = frame['Phrase']
    labels = frame['Sentiment']

    # Data to Embedding: features stay None for any other embedding type.
    features = None
    if embedding_kind == TDIDF_EMBEDDING:
        emb_cfg = params['emb_params']
        # The second return value of tdidf_preprocessing is not needed here.
        phrase_matrix, _ = tdidf_preprocessing(
            phrases,
            n_gram_range=emb_cfg['ngram_range'],
            max_features=emb_cfg['max_features'],
        )
        features = phrase_matrix.toarray()

    # Imbalance Data: default is to hand back features/labels untouched.
    x_data, y_data = features, labels
    if balancing_kind == SMOTE_IMBALANCE:
        imb_cfg = params['imb_params']
        x_data, y_data = smote_oversampling(
            features,
            labels,
            random_state=imb_cfg['random_state'],
            k_neighbors=imb_cfg['k_neighbors'],
        )

    return x_data, y_data
def load_data(self, filepath):
    """Load the dataset at ``filepath`` by delegating to ``extract_dataset``.

    Args:
        filepath: Location of the dataset, forwarded unchanged.

    Returns:
        Whatever ``extract_dataset`` returns for ``filepath``.
    """
    dataset = extract_dataset(filepath)
    return dataset
def test_preprocessing(self):
    """Load the Kaggle training file and run it through preprocessing.

    Returns:
        The value returned by ``data_preprocessing`` for the loaded data.
    """
    raw_frame = extract_dataset('../resources/kaggle/train.tsv')
    # NOTE(review): a `save_dir=PREPROCESSED_DATA_DIR` argument was commented
    # out of this call; it remains disabled here.
    return data_preprocessing(raw_frame)