class Pipeline(BaseEstimator, TransformerMixin):
    """Two-stage sklearn-style pipeline: feature generation, then preprocessing.

    ``fit_transform`` builds and fits a :class:`FeatureGenerator` followed by a
    :class:`Preprocessor`; ``transform`` replays both fitted stages on new data.

    Parameters
    ----------
    numeric : list
        Names of numeric columns (passed through to both stages).
    id : str, optional
        Identifier column name. NOTE: shadows the builtin ``id``; kept for
        backward compatibility with existing callers.
    target : str, optional
        Target column name.
    categorical : list, optional
        Names of categorical columns.
    verbose : int, optional
        Verbosity level forwarded to ``Timer`` and both stages.
    """

    def __init__(self, numeric, id=None, target=None, categorical=None,
                 verbose=0):
        # NOTE(review): created_features is never assigned anywhere in this
        # class, so get_feature_names() returns None unless an external caller
        # sets it -- confirm against FeatureGenerator's API.
        self.created_features = None
        self.id = id
        self.target = target
        self.categorical = categorical
        self.numeric = numeric
        self.verbose = verbose
        # Both stages are created lazily in fit_transform(); None means
        # "not fitted yet" (checked by transform()).
        self.feature_generator = None
        self.preprocessor = None

    def fit_transform(self, df, y=None, **fit_params):
        """Fit feature generation and preprocessing on ``df``; return the result.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw input frame.
        y : ignored
            Present for sklearn API compatibility.

        Returns
        -------
        The preprocessed feature matrix produced by ``Preprocessor.fit_transform``.
        """
        with Timer('pipelines.Pipeline.fit_transform:', self.verbose):
            self.feature_generator = FeatureGenerator(
                id=self.id,
                numeric=self.numeric,
                categorical=self.categorical,
                target=self.target,
                verbose=self.verbose,
            )
            df_features = self.feature_generator.fit_transform(df)
            self.preprocessor = Preprocessor(
                id=self.id,
                numeric=self.numeric,
                categorical=self.categorical,
                target=self.target,
                verbose=self.verbose,
            )
            x = self.preprocessor.fit_transform(df_features)
            return x

    def transform(self, df):
        """Apply the already-fitted stages to ``df``.

        Raises
        ------
        NotFittedError
            If ``fit``/``fit_transform`` has not been called yet.
        """
        with Timer('pipelines.Pipeline.transform:', self.verbose):
            if self.feature_generator is None:
                raise NotFittedError(
                    f'feature_generator = {self.feature_generator}')
            if self.preprocessor is None:
                raise NotFittedError(f'preprocessor = {self.preprocessor}')
            df_features = self.feature_generator.transform(df)
            x = self.preprocessor.transform(df_features)
            return x

    def fit(self, x, y=None, **fit_params):
        """Fit both stages and return ``self``.

        BUGFIX: previously this was a no-op, so ``fit`` followed by
        ``transform`` always raised NotFittedError. It now delegates to
        :meth:`fit_transform` (discarding the transformed output), which is
        the behavior the sklearn estimator contract expects.
        """
        self.fit_transform(x, y, **fit_params)
        return self

    def get_feature_names(self):
        """Return the generated feature names (None until externally set -- see
        the note in ``__init__``)."""
        return self.created_features
## Flatten the nested data columns to one record per row, renumber the rows,
## and keep the new row number as an explicit column (drop=False).
predict_df = (predict_df.explode(data_columns)
              .reset_index(drop=True)
              .reset_index(drop=False))

## do the preprocessing
print("Preprocess")
preprocessor = Preprocessor(
    doLower=args["doLower"],
    doLemmatization=args["doLemmatization"],
    removeStopWords=args["removeStopWords"],
    doSpellingCorrection=args["doSpellingCorrection"],
    removeNewLine=args["removeNewLine"],
    removePunctuation=args["removePunctuation"],
    removeHtmlTags=args["removeHtmlTags"],
    minTextLength=args["minTextLength"],
)
predict_df["processed"] = preprocessor.fit_transform(predict_df["text_german"])
# Rows whose text was filtered away by the preprocessor come back as NaN;
# drop them before tokenization.
predict_df = predict_df.dropna(subset=["processed"], axis=0)

print("Tokenize")
tokenizer = Tokenizer(
    tokenizeStr=preperation_technique,
    ngram=preperation_ngram,
    fasttextFile=args["fasttext_file"],
    doLower=args["doLower"],
)
predict_df["processed"] = tokenizer.fit_transform(predict_df["processed"])

## for testing purposes
#train_df = train_df.sample(100)
#val_df = val_df.sample(20)
#test_df = test_df.sample(20)

## apply the model
# NOTE(review): this chunk was recovered from a whitespace-mangled source.
# The two lines below read like the body of a guard clause (validation split
# not supplied) whose condition falls outside this view -- confirm the real
# nesting against the original file before trusting this layout.
# NOTE(review): "vaidation" is a typo that ships in the runtime message; it is
# deliberately left unchanged here (changing it would alter behavior).
logging.error("vaidation_split needs to be given.")
sys.exit("vaidation_split needs to be given.")

## get data and train columns
# assumes exactly one non-target column remains -- TODO confirm with callers
data_column = list(set(train_df.columns) - set(args["targets"]))[0]

## do the preprocessing
print("Preprocess")
preprocessor = Preprocessor(
    doLower=args["doLower"],
    doLemmatization=args["doLemmatization"],
    removeStopWords=args["removeStopWords"],
    doSpellingCorrection=args["doSpellingCorrection"],
    removeNewLine=args["removeNewLine"],
    removePunctuation=args["removePunctuation"])
# Fit on train only; val/test reuse the fitted preprocessor so no statistics
# leak from the evaluation splits.
train_df[data_column] = preprocessor.fit_transform(
    train_df[data_column])
val_df[data_column] = preprocessor.transform(val_df[data_column])
test_df[data_column] = preprocessor.transform(test_df[data_column])

## save the preprocessed data
# NOTE(review): the else below can only attach to this if in the visible
# tokens, i.e. "temp dir missing -> create it and cache the preprocessed
# splits; temp dir present -> load the cached pickles". Caching keyed solely
# on directory existence ignores changed preprocessing args -- verify this
# matches the original file's intent.
if not os.path.exists(os.path.join(args["data_path"], "temp")):
    os.makedirs(os.path.join(args["data_path"], "temp"))
    train_df.to_pickle(train_pre_path)
    val_df.to_pickle(val_pre_path)
    test_df.to_pickle(test_pre_path)
else:
    train_df = pd.read_pickle(train_pre_path)
    val_df = pd.read_pickle(val_pre_path)
    test_df = pd.read_pickle(test_pre_path)

## get data and train columns
# Recomputed after the if/else so the load-from-pickle path also has it.
data_column = list(set(train_df.columns) - set(args["targets"]))[0]