Example #1
def run_preprocessing(csr: AbstractModel,
                      train_preprocessed_path,
                      test_preprocessed_path,
                      full_data=True):
    """
  Runs the preprocessing methods according to the chosen classifier
    on the train and test data

  :param csr: chosen classifier (child of AbstractModel)
  :type csr: AbstractModel
  :param train_preprocessed_path: path to load train data
  :type train_preprocessed_path: str
  :param test_preprocessed_path: path to load test data
  :type test_preprocessed_path: str
  :param full_data: if False, the small dataset (200K rows) is used
  :type full_data: bool, optional
  """

    # Read data
    if full_data:
        dataset_files = [TRAIN_DATA_NEGATIVE_FULL, TRAIN_DATA_POSITIVE_FULL]
    else:
        dataset_files = [TRAIN_DATA_NEGATIVE, TRAIN_DATA_POSITIVE]

    train_preprocessing = Preprocessing(dataset_files, submission=False)
    test_preprocessing = Preprocessing([TEST_DATA], submission=True)

    # Preprocess it
    for method in csr.get_preprocessing_methods(istest=False):
        getattr(train_preprocessing, method)()

    for method in csr.get_preprocessing_methods(istest=True):
        getattr(test_preprocessing, method)()

    # Save it
    train_df = train_preprocessing.get()
    train_df = train_df.sample(frac=1)  # shuffle the training rows before saving

    train_df.to_csv(train_preprocessed_path, index=False)
    test_preprocessing.get().to_csv(test_preprocessed_path, index=False)
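A minimal usage sketch for the function above; SomeClassifier and the output paths are hypothetical stand-ins, only run_preprocessing and its signature come from the example itself:

# SomeClassifier is assumed to be a concrete child of AbstractModel.
csr = SomeClassifier()
run_preprocessing(csr,
                  train_preprocessed_path="data/train_preprocessed.csv",
                  test_preprocessed_path="data/test_preprocessed.csv",
                  full_data=False)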
Example #2
import glob
import string

import cv2

# Tools, Database, Preprocessing and Histogram are assumed to live in the same
# "classes" package as the modules imported in the original snippet.
from classes.tools import Tools
from classes.database import Database
from classes.preprocessing import Preprocessing
from classes.histogram import Histogram
from classes.featureExtraction import FeatureExtraction
from classes.sternmuster import Sternmuster
from classes.pca import PCA

tools = Tools()
database = Database()
database.initializeEmpty()

char_values = string.ascii_uppercase + string.ascii_lowercase  # Expected characters in the training set

splitted_t_set = []

for t_set in glob.glob('./trainingdata/*.png'):
    print("Reading image: " + t_set)
    img = cv2.imread(t_set, cv2.IMREAD_GRAYSCALE)
    preprocess = Preprocessing(img)
    preprocess.binariseImg()
    splitted_chars = preprocess.splitChars()
    splitted_t_set.append(splitted_chars)

# Histogram feature: store the row histogram of each expected character, per font
for i in range(len(char_values)):
    for font in splitted_t_set:
        histogram = Histogram(font[i])
        database.add(char_values[i], 'histogram', histogram.rowWert2wert())

# Pixel average feature: store the mean pixel value of each expected character, per font
for i in range(len(char_values)):
    for font in splitted_t_set:
        pix_av = FeatureExtraction(font[i])
        database.add(char_values[i], 'pixelAv', pix_av.getpixelaverage())
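A rough sketch, not part of the original example, of how the same preprocessing and histogram extraction might be applied to one unseen image; the file path is illustrative, the calls are the ones used above:

img = cv2.imread('./testdata/sample.png', cv2.IMREAD_GRAYSCALE)  # illustrative path
preprocess = Preprocessing(img)
preprocess.binariseImg()
chars = preprocess.splitChars()
for char_img in chars:
    features = Histogram(char_img).rowWert2wert()
    # 'features' can then be compared against the values stored in the database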
Example #3
# Select the column to analyse from the fake and true news dataframes
fake = dataset_fake[analysis]
true = dataset_true[analysis]

path = "preprocessed_datasets/final_other_dataset.csv"

print("")
print("PREPROCESSING:")
print("")
''' FAKE NEWS DATASET '''

print("INPUT:")
print("(TYPE: ", type(fake), ")")
print(fake.head(10))

preprocesser_fake = Preprocessing(
    fake, date, time, analysis=analysis, news_type="fake",
    language="es")  # here you can set the configuration
data_fake = preprocesser_fake.run_pipeline()
print("")
print("FINAL OUTPUT:")

if preprocesser_fake.aggregation:
    print("(TYPE: ", type(data_fake.aggregated), ")")
    print(data_fake.aggregated)

else:
    print("(TYPE: ", type(data_fake.docvectors), ")")
    print(data_fake.docvectors)
''' REAL NEWS DATASET '''

print("INPUT:")
Example #4

import os

import pandas as pd

from classes.preprocessing import Preprocessing
from classes.stats import Stats


if __name__ == '__main__':
    
    # ************************************************** #
    #           PREPROCESSING OF THE DATASET             #
    # ************************************************** #

    base_dir = os.path.dirname(os.path.realpath(__file__))  # avoid shadowing the built-in dir()
    dataset = pd.read_csv(base_dir + '/dataset/IMDBDataset.csv')
    
    stats = Stats()
    prep = Preprocessing(dataset)
    
    # Make a dictionary by tokenizing all words in the dataset
    prep.make_dictionary()
    
    # Encode all words with integer IDs
    # Encode only the most used words in the dataset, any other words encode as 0
    n_top_used_words = 10000
    dataset = prep.encode_dataset_column(df=dataset, field="review", use_top_words=n_top_used_words)

    # Encode target variables to binary representation
    dataset = prep.string_to_int(df=dataset, params={"sentiment": {'positive': 1, 'negative': 0}})

    # Pad all reviews, remove reviews that have no words, trim reviews that exceed the review_len value
    review_len = 500
    dataset = prep.pad_text(df=dataset, column="review_encoded", min_words=1, max_words=review_len)
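A standalone sketch, not part of the original example, of how the encoded and padded dataset might be turned into model-ready arrays; it assumes that after pad_text each row of "review_encoded" holds exactly review_len integer IDs and that "sentiment" holds the 0/1 labels produced above:

import numpy as np

X = np.array(dataset["review_encoded"].tolist())  # shape: (num_reviews, review_len), under the assumption above
y = dataset["sentiment"].to_numpy()               # binary sentiment labels
print(X.shape, y.shape)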
Example #5

import ast
from datetime import datetime

import pandas as pd

# Preprocessing, PATH_TEXTS and date are assumed to come from the surrounding project.

time = datetime.now().strftime('%H.%M')

generated = pd.read_csv(PATH_TEXTS)
print(generated)

generated["text"] = generated["text"].apply(lambda s: ast.literal_eval(s))

pp_generated = Preprocessing(
    generated,
    date,
    time,
    analysis="text",
    news_type="generated",
    duplicate_rows_removal=False,
    lowercasing=False,
    tokenization=False,
    lemmatization=False,
    noise_removal=False,
    stemming=False,
    stopword_removal=False,
    entity_recognition=False,
    data_augmentation=False,
    word2vec=True,
    doc2vec=False,
    aggregation=True)  # here you can set the configuration

gen = pp_generated.run_pipeline()
dataframe = pd.DataFrame(gen.aggregated, columns=["text"])
dataframe["membership"] = generated["membership"]
dataset = pp_generated.shuffle(dataframe).reset_index()
dataset.columns = ["old index", "text", "membership"]
dataset.index.name = "index"