def run_preprocessing(csr: AbstractModel, train_preprocessed_path, test_preprocessed_path, full_data=True):
    """
    Runs the preprocessing methods according to the chosen classifier on the train and test data

    :param csr: chosen classifier (child of AbstractModel)
    :type csr: AbstractModel
    :param train_preprocessed_path: path where the preprocessed train data is saved
    :type train_preprocessed_path: str
    :param test_preprocessed_path: path where the preprocessed test data is saved
    :type test_preprocessed_path: str
    :param full_data: if False, the small dataset (200K rows) is used
    :type full_data: bool, optional
    """
    # Read data
    if full_data:
        dataset_files = [TRAIN_DATA_NEGATIVE_FULL, TRAIN_DATA_POSITIVE_FULL]
    else:
        dataset_files = [TRAIN_DATA_NEGATIVE, TRAIN_DATA_POSITIVE]
    train_preprocessing = Preprocessing(dataset_files, submission=False)
    test_preprocessing = Preprocessing([TEST_DATA], submission=True)

    # Apply the classifier-specific preprocessing methods, looked up by name
    for method in csr.get_preprocessing_methods(istest=False):
        getattr(train_preprocessing, method)()
    for method in csr.get_preprocessing_methods(istest=True):
        getattr(test_preprocessing, method)()

    # Shuffle the training rows and save both splits
    train_df = train_preprocessing.get()
    train_df = train_df.sample(frac=1)
    train_df.to_csv(train_preprocessed_path, index=False)
    test_preprocessing.get().to_csv(test_preprocessed_path, index=False)
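
# A minimal usage sketch. SomeClassifier and the two output paths below are
# hypothetical, standing in for any concrete AbstractModel subclass and any
# writable CSV locations:
#
#   classifier = SomeClassifier()
#   run_preprocessing(classifier,
#                     train_preprocessed_path='data/train_preprocessed.csv',
#                     test_preprocessed_path='data/test_preprocessed.csv',
#                     full_data=False)  # quick run on the small 200K-row dataset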
import glob
import string

import cv2

from classes.featureExtraction import FeatureExtraction
from classes.sternmuster import Sternmuster
from classes.pca import PCA

tools = Tools()
database = Database()
database.initializeEmpty()

# Expected characters in the training set
char_values = string.ascii_uppercase + string.ascii_lowercase

# Read every training image, binarise it and split it into single characters
splitted_t_set = []
for t_set in glob.glob('./trainingdata/*.png'):
    print("Reading image: " + t_set)
    img = cv2.imread(t_set, cv2.IMREAD_GRAYSCALE)
    preprocess = Preprocessing(img)
    preprocess.binariseImg()
    splitted_chars = preprocess.splitChars()
    splitted_t_set.append(splitted_chars)

# Histogram feature: one entry per character and font
for i in range(len(char_values)):
    for font in splitted_t_set:
        histogram = Histogram(font[i])
        database.add(char_values[i], 'histogram', histogram.rowWert2wert())

# Pixel-average feature: one entry per character and font
for i in range(len(char_values)):
    for font in splitted_t_set:
        pix_av = FeatureExtraction(font[i])
        database.add(char_values[i], 'pixelAv', pix_av.getpixelaverage())
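
# An alternative sketch (commented out so it does not re-add entries): both
# features come from the same character crops, so the two loops above could be
# fused into a single pass over the training set, using the same Database,
# Histogram, and FeatureExtraction calls:
#
#   for i, char in enumerate(char_values):
#       for font in splitted_t_set:
#           database.add(char, 'histogram', Histogram(font[i]).rowWert2wert())
#           database.add(char, 'pixelAv', FeatureExtraction(font[i]).getpixelaverage())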
fake = dataset_fake[analysis]
true = dataset_true[analysis]
path = "preprocessed_datasets/final_other_dataset.csv"

print("")
print("PREPROCESSING:")
print("")

''' FAKE NEWS DATASET '''
print("INPUT:")
print("(TYPE: ", type(fake), ")")
print(fake.head(10))

# Here you can set the pipeline configuration
preprocesser_fake = Preprocessing(
    fake, date, time, analysis=analysis, news_type="fake", language="es")
data_fake = preprocesser_fake.run_pipeline()

print("")
print("FINAL OUTPUT:")
if preprocesser_fake.aggregation:
    print("(TYPE: ", type(data_fake.aggregated), ")")
    print(data_fake.aggregated)
else:
    print("(TYPE: ", type(data_fake.docvectors), ")")
    print(data_fake.docvectors)

''' REAL NEWS DATASET '''
print("INPUT:")
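# The real-news branch presumably mirrors the fake-news branch above; a sketch
# under that assumption (news_type="true" being the only change):
#
#   print("(TYPE: ", type(true), ")")
#   print(true.head(10))
#   preprocesser_true = Preprocessing(
#       true, date, time, analysis=analysis, news_type="true", language="es")
#   data_true = preprocesser_true.run_pipeline()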
import os

import pandas as pd

from classes.preprocessing import Preprocessing
from classes.stats import Stats

if __name__ == '__main__':
    # ************************************************** #
    #            PREPROCESSING OF THE DATASET            #
    # ************************************************** #
    base_dir = os.path.dirname(os.path.realpath(__file__))
    dataset = pd.read_csv(base_dir + '/dataset/IMDBDataset.csv')
    stats = Stats()
    prep = Preprocessing(dataset)

    # Build a dictionary by tokenizing all words in the dataset
    prep.make_dictionary()

    # Encode words with integer IDs; only the most used words in the dataset get
    # their own ID, any other word is encoded as 0
    n_top_used_words = 10000
    dataset = prep.encode_dataset_column(df=dataset, field="review", use_top_words=n_top_used_words)

    # Encode the target variable to a binary representation
    dataset = prep.string_to_int(df=dataset, params={"sentiment": {'positive': 1, 'negative': 0}})

    # Pad all reviews, drop reviews that have no words, trim reviews longer than review_len
    review_len = 500
    dataset = prep.pad_text(df=dataset, column="review_encoded", min_words=1, max_words=review_len)
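
    # A quick sanity check on the result (assumption: pad_text keeps the
    # "review_encoded" column and string_to_int leaves "sentiment" as 0/1 ints):
    print(dataset[["review_encoded", "sentiment"]].head())
    assert dataset["sentiment"].isin([0, 1]).all()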
time = datetime.now().strftime('%H.%M')

generated = pd.read_csv(PATH_TEXTS)
print(generated)

# The "text" column holds string representations of token lists; parse them back
generated["text"] = generated["text"].apply(ast.literal_eval)

# Here you can set the configuration: every cleaning and tokenization step is
# disabled (the texts are already token lists), leaving only word2vec and
# aggregation active
pp_generated = Preprocessing(
    generated, date, time,
    analysis="text",
    news_type="generated",
    duplicate_rows_removal=False,
    lowercasing=False,
    tokenization=False,
    lemmatization=False,
    noise_removal=False,
    stemming=False,
    stopword_removal=False,
    entity_recognition=False,
    data_augmentation=False,
    word2vec=True,
    doc2vec=False,
    aggregation=True)
gen = pp_generated.run_pipeline()

# Build the final frame: aggregated vectors plus membership labels, shuffled
dataframe = pd.DataFrame(gen.aggregated, columns=["text"])
dataframe["membership"] = generated["membership"]
dataset = pp_generated.shuffle(dataframe).reset_index()
dataset.columns = ["old index", "text", "membership"]
dataset.index.name = "index"
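
# A follow-up sketch: persist the shuffled dataset so later runs can reload it
# instead of re-running the pipeline (the output path below is hypothetical,
# not from the original script):
dataset.to_csv("preprocessed_datasets/generated_dataset.csv")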