Example #1
def run_preprocessing(csr: AbstractModel,
                      train_preprocessed_path,
                      test_preprocessed_path,
                      full_data=True):
    """
  Runs the preprocessing methods according to the chosen classifier
    on the train and test data

  :param csr: chosen classifier (child of AbstractModel)
  :type csr: AbstractModel
  :param train_preprocessed_path: path to load train data
  :type train_preprocessed_path: str
  :param test_preprocessed_path: path to load test data
  :type test_preprocessed_path: str
  :param full_data: if False, the small dataset (200K rows) is used
  :type full_data: bool, optional
  """

    # Read data
    if full_data:
        dataset_files = [TRAIN_DATA_NEGATIVE_FULL, TRAIN_DATA_POSITIVE_FULL]
    else:
        dataset_files = [TRAIN_DATA_NEGATIVE, TRAIN_DATA_POSITIVE]

    train_preprocessing = Preprocessing(dataset_files, submission=False)
    test_preprocessing = Preprocessing([TEST_DATA], submission=True)

    # Preprocess it
    for method in csr.get_preprocessing_methods(istest=False):
        getattr(train_preprocessing, method)()

    for method in csr.get_preprocessing_methods(istest=True):
        getattr(test_preprocessing, method)()

    # Save it
    train_df = train_preprocessing.get()
    train_df = train_df.sample(frac=1)  # shuffle the training rows before saving

    train_df.to_csv(train_preprocessed_path, index=False)
    test_preprocessing.get().to_csv(test_preprocessed_path, index=False)
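A minimal usage sketch for the function above; SomeClassifier and the output paths are hypothetical stand-ins, only run_preprocessing and its signature come from the example itself:

# SomeClassifier is assumed to be a concrete child of AbstractModel.
csr = SomeClassifier()
run_preprocessing(csr,
                  train_preprocessed_path="data/train_preprocessed.csv",
                  test_preprocessed_path="data/test_preprocessed.csv",
                  full_data=False)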
Example #2
import glob
import string

import cv2

# Tools, Database, Preprocessing and Histogram are assumed to live in the same
# "classes" package as the modules imported in the original snippet.
from classes.tools import Tools
from classes.database import Database
from classes.preprocessing import Preprocessing
from classes.histogram import Histogram
from classes.featureExtraction import FeatureExtraction
from classes.sternmuster import Sternmuster
from classes.pca import PCA

tools = Tools()
database = Database()
database.initializeEmpty()

char_values = string.ascii_uppercase + string.ascii_lowercase  # Expected characters in the training set

splitted_t_set = []

for t_set in glob.glob('./trainingdata/*.png'):
    print("Reading image: " + t_set)
    img = cv2.imread(t_set, cv2.IMREAD_GRAYSCALE)
    preprocess = Preprocessing(img)
    preprocess.binariseImg()
    splitted_chars = preprocess.splitChars()
    splitted_t_set.append(splitted_chars)

# Histogram feature: store the row histogram of each expected character, per font
for i in range(len(char_values)):
    for font in splitted_t_set:
        histogram = Histogram(font[i])
        database.add(char_values[i], 'histogram', histogram.rowWert2wert())

# Pixel average feature: store the mean pixel value of each expected character, per font
for i in range(len(char_values)):
    for font in splitted_t_set:
        pix_av = FeatureExtraction(font[i])
        database.add(char_values[i], 'pixelAv', pix_av.getpixelaverage())
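A rough sketch, not part of the original example, of how the same preprocessing and histogram extraction might be applied to one unseen image; the file path is illustrative, the calls are the ones used above:

img = cv2.imread('./testdata/sample.png', cv2.IMREAD_GRAYSCALE)  # illustrative path
preprocess = Preprocessing(img)
preprocess.binariseImg()
chars = preprocess.splitChars()
for char_img in chars:
    features = Histogram(char_img).rowWert2wert()
    # 'features' can then be compared against the values stored in the database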
Example #3
# Select the column to analyse from the fake and true news dataframes
fake = dataset_fake[analysis]
true = dataset_true[analysis]

path = "preprocessed_datasets/final_other_dataset.csv"

print("")
print("PREPROCESSING:")
print("")
''' FAKE NEWS DATASET '''

print("INPUT:")
print("(TYPE: ", type(fake), ")")
print(fake.head(10))

preprocesser_fake = Preprocessing(
    fake, date, time, analysis=analysis, news_type="fake",
    language="es")  # here you can set the configuration
data_fake = preprocesser_fake.run_pipeline()
print("")
print("FINAL OUTPUT:")

if preprocesser_fake.aggregation:
    print("(TYPE: ", type(data_fake.aggregated), ")")
    print(data_fake.aggregated)

else:
    print("(TYPE: ", type(data_fake.docvectors), ")")
    print(data_fake.docvectors)
''' REAL NEWS DATASET '''

print("INPUT:")
Example #4

import os

import pandas as pd

from classes.preprocessing import Preprocessing
from classes.stats import Stats


if __name__ == '__main__':
    
    # ************************************************** #
    #           PREPROCESSING OF THE DATASET             #
    # ************************************************** #

    base_dir = os.path.dirname(os.path.realpath(__file__))  # avoid shadowing the built-in dir()
    dataset = pd.read_csv(base_dir + '/dataset/IMDBDataset.csv')
    
    stats = Stats()
    prep = Preprocessing(dataset)
    
    # Make a dictionary by tokenizing all words in the dataset
    prep.make_dictionary()
    
    # Encode all words with integer IDs
    # Encode only the most used words in the dataset, any other words encode as 0
    n_top_used_words = 10000
    dataset = prep.encode_dataset_column(df=dataset, field="review", use_top_words=n_top_used_words)

    # Encode target variables to binary representation
    dataset = prep.string_to_int(df=dataset, params={"sentiment": {'positive': 1, 'negative': 0}})

    # Pad all reviews, remove reviews that have no words, trim reviews that exceed the review_len value
    review_len = 500
    dataset = prep.pad_text(df=dataset, column="review_encoded", min_words=1, max_words=review_len)
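A standalone sketch, not part of the original example, of how the encoded and padded dataset might be turned into model-ready arrays; it assumes that after pad_text each row of "review_encoded" holds exactly review_len integer IDs and that "sentiment" holds the 0/1 labels produced above:

import numpy as np

X = np.array(dataset["review_encoded"].tolist())  # shape: (num_reviews, review_len), under the assumption above
y = dataset["sentiment"].to_numpy()               # binary sentiment labels
print(X.shape, y.shape)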
Example #5

import ast
from datetime import datetime

import pandas as pd

# Preprocessing, PATH_TEXTS and date are assumed to come from the surrounding project.

time = datetime.now().strftime('%H.%M')

generated = pd.read_csv(PATH_TEXTS)
print(generated)

generated["text"] = generated["text"].apply(lambda s: ast.literal_eval(s))

pp_generated = Preprocessing(
    generated,
    date,
    time,
    analysis="text",
    news_type="generated",
    duplicate_rows_removal=False,
    lowercasing=False,
    tokenization=False,
    lemmatization=False,
    noise_removal=False,
    stemming=False,
    stopword_removal=False,
    entity_recognition=False,
    data_augmentation=False,
    word2vec=True,
    doc2vec=False,
    aggregation=True)  # here you can set the configuration

gen = pp_generated.run_pipeline()
dataframe = pd.DataFrame(gen.aggregated, columns=["text"])
dataframe["membership"] = generated["membership"]
dataset = pp_generated.shuffle(dataframe).reset_index()
dataset.columns = ["old index", "text", "membership"]
dataset.index.name = "index"