def test_unit(self):
    """Run RemovePunctuation over a string column and compare with the expected frame."""
    name = "test_column"
    raw_values = ["Test:String", "Test.St.ri.ng", "Tes,t:String", "TestSt::ring!"]
    # Every input is expected to collapse to the same punctuation-free text.
    expected_values = ["TestString", "TestString", "TestString", "TestString"]
    raw_frame = pandas.DataFrame({name: raw_values})
    expected_frame = pandas.DataFrame({name: expected_values})

    actual = Dataframe()
    expected = Dataframe()
    actual.set_dataframe(raw_frame)
    expected.set_dataframe(expected_frame)

    # The visitor's return value is ignored; the assertion inspects `actual`.
    RemovePunctuation(name).visit(actual)

    test.assert_frame_equal(actual.get_dataframe(), expected.get_dataframe())
def test_unit(self):
    """Run Stem over a column of words and compare with the expected stems."""
    name = "test_column"
    # Maps each input word to the stem the test expects for it.
    word_to_stem = {
        "maximum": "maxim",
        "presumably": "presum",
        "multiply": "multiply",
        "provision": "provid",
    }
    raw_frame = pandas.DataFrame({name: list(word_to_stem.keys())})
    expected_frame = pandas.DataFrame({name: list(word_to_stem.values())})

    actual = Dataframe()
    expected = Dataframe()
    actual.set_dataframe(raw_frame)
    expected.set_dataframe(expected_frame)

    # The visitor's return value is ignored; the assertion inspects `actual`.
    Stem(name).visit(actual)

    test.assert_frame_equal(actual.get_dataframe(), expected.get_dataframe())
def test_unit(self):
    """Run Join with a custom separator over mixed cells and compare the result.

    The input column holds a plain string, a list of strings and an empty
    string; the expected column holds the separator-joined form of each.
    """
    name = "test_column"
    separator = "!wow!"
    raw_cells = ["TestString", ["Test", "String"], ""]
    expected_cells = [
        "T!wow!e!wow!s!wow!t!wow!S!wow!t!wow!r!wow!i!wow!n!wow!g",
        "Test!wow!String",
        "",
    ]
    raw_frame = pandas.DataFrame({name: raw_cells})
    expected_frame = pandas.DataFrame({name: expected_cells})

    actual = Dataframe()
    expected = Dataframe()
    actual.set_dataframe(raw_frame)
    expected.set_dataframe(expected_frame)

    # The visitor's return value is ignored; the assertion inspects `actual`.
    Join(name, separator).visit(actual)

    test.assert_frame_equal(actual.get_dataframe(), expected.get_dataframe())
def test_unit(self):
    """Run RemoveChar for the character "s" and compare with the expected frame."""
    name = "test_column"
    unwanted = "s"
    raw_values = ["TestString", "super", "wow"]
    # Only the lowercase "s" is expected to disappear; the capital "S" stays.
    expected_values = ["TetString", "uper", "wow"]
    raw_frame = pandas.DataFrame({name: raw_values})
    expected_frame = pandas.DataFrame({name: expected_values})

    actual = Dataframe()
    expected = Dataframe()
    actual.set_dataframe(raw_frame)
    expected.set_dataframe(expected_frame)

    # The visitor's return value is ignored; the assertion inspects `actual`.
    RemoveChar(name, unwanted).visit(actual)

    test.assert_frame_equal(actual.get_dataframe(), expected.get_dataframe())
def test_unit(self):
    """Run Lemmatize over a column of plural nouns and compare the result."""
    name = "test_column"
    raw_values = ["dogs", "churches", "aardwolves", "abaci"]
    # NOTE(review): the expected values are identical to the inputs, so this
    # test only passes if Lemmatize leaves these words unchanged - confirm
    # that this is the intended expectation.
    expected_values = ["dogs", "churches", "aardwolves", "abaci"]
    raw_frame = pandas.DataFrame({name: raw_values})
    expected_frame = pandas.DataFrame({name: expected_values})

    actual = Dataframe()
    expected = Dataframe()
    actual.set_dataframe(raw_frame)
    expected.set_dataframe(expected_frame)

    # The visitor's return value is ignored; the assertion inspects `actual`.
    Lemmatize(name).visit(actual)

    test.assert_frame_equal(actual.get_dataframe(), expected.get_dataframe())
def test_unit(self):
    """Run NGram with n=3 over the string "12345" and compare the result."""
    name = "test_column"
    size = 3
    # The single input cell is expected to become a list of character triples.
    expected_grams = [("1", "2", "3"), ("2", "3", "4"), ("3", "4", "5")]
    raw_frame = pandas.DataFrame({name: ["12345"]})
    expected_frame = pandas.DataFrame({name: [expected_grams]})

    actual = Dataframe()
    expected = Dataframe()
    actual.set_dataframe(raw_frame)
    expected.set_dataframe(expected_frame)

    # The visitor's return value is ignored; the assertion inspects `actual`.
    NGram(name, size).visit(actual)

    test.assert_frame_equal(actual.get_dataframe(), expected.get_dataframe())
def test_unit(self):
    """Run LowerCase over mixed-case strings and compare with the expected frame."""
    name = "test_column"
    raw_values = ["TestString", "TESTSTRING", "teststring", "TestString0!"]
    # Digits and punctuation are expected to pass through untouched.
    expected_values = ["teststring", "teststring", "teststring", "teststring0!"]
    raw_frame = pandas.DataFrame({name: raw_values})
    expected_frame = pandas.DataFrame({name: expected_values})

    actual = Dataframe()
    expected = Dataframe()
    actual.set_dataframe(raw_frame)
    expected.set_dataframe(expected_frame)

    # The visitor's return value is ignored; the assertion inspects `actual`.
    LowerCase(name).visit(actual)

    test.assert_frame_equal(actual.get_dataframe(), expected.get_dataframe())
def maxabsscaler(table: str, column: str, copy: bool = True):
    """Pipe a fresh Dataframe through the load, MaxAbsScaler and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
        copy: Passed positionally to MaxAbsScaler.
    """
    featureset = Dataframe()
    source = LoadFromDatabase(table=table, column=column)
    sink = LoadToDatabase(table=table, column=column)
    scale = MaxAbsScaler(copy)

    # Each visitor's result feeds the next one; the sink's result is unused.
    for stage in (source, scale, sink):
        featureset = stage.visit(featureset)
def labelencode(table: str, column: str, mode: str = "shuffle"):
    """Pipe a fresh Dataframe through the load, LabelEncoder and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to both database visitors and to LabelEncoder.
        mode: Passed as the first positional argument of LabelEncoder.
    """
    empty = Dataframe()
    reader = LoadFromDatabase(table=table, column=column)
    writer = LoadToDatabase(table=table, column=column)
    encoder = LabelEncoder(mode, column)

    loaded = reader.visit(empty)
    encoded = encoder.visit(loaded)
    writer.visit(encoded)
def join(table: str, column: str, char: str):
    """Pipe a fresh Dataframe through the load, JoinOperation and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
        char: Passed to JoinOperation as its ``value`` keyword.
    """
    featureset = Dataframe()
    source = LoadFromDatabase(table=table, column=column)
    sink = LoadToDatabase(table=table, column=column)
    # Named `joiner` so the local does not shadow this function's own name.
    joiner = JoinOperation(value=char)

    # Each visitor's result feeds the next one; the sink's result is unused.
    for stage in (source, joiner, sink):
        featureset = stage.visit(featureset)
def interpolate(table: str, column: str, method: str = "linear"):
    """Pipe a fresh Dataframe through the load, Interpolate and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
        method: Passed positionally to Interpolate.
    """
    empty = Dataframe()
    reader = LoadFromDatabase(table=table, column=column)
    writer = LoadToDatabase(table=table, column=column)
    interpolator = Interpolate(method)

    loaded = reader.visit(empty)
    filled = interpolator.visit(loaded)
    writer.visit(filled)
def mean_word(table: str, column: str):
    """Pipe a fresh Dataframe through the load, MeanWord and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to both database visitors and to MeanWord.
    """
    featureset = Dataframe()
    source = LoadFromDatabase(table=table, column=column)
    sink = LoadToDatabase(table=table, column=column)
    mean_word_step = MeanWord(column=column)

    # Each visitor's result feeds the next one; the sink's result is unused.
    for stage in (source, mean_word_step, sink):
        featureset = stage.visit(featureset)
def condense_simple(table: str, column: str, numeric_feature: str = "median"):
    """Pipe a fresh Dataframe through the load, CondenseSimple and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to both database visitors and to CondenseSimple.
        numeric_feature: Passed as the second positional argument of
            CondenseSimple.
    """
    empty = Dataframe()
    reader = LoadFromDatabase(table=table, column=column)
    writer = LoadToDatabase(table=table, column=column)
    condenser = CondenseSimple(column, numeric_feature)

    loaded = reader.visit(empty)
    condensed = condenser.visit(loaded)
    writer.visit(condensed)
def ngram(table: str, column: str, value: int):
    """Pipe a fresh Dataframe through the load, NGram and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
        value: Passed to NGram as its ``n_gram_value`` keyword.
    """
    featureset = Dataframe()
    source = LoadFromDatabase(table=table, column=column)
    sink = LoadToDatabase(table=table, column=column)
    ngram_step = NGram(n_gram_value=value)

    # Each visitor's result feeds the next one; the sink's result is unused.
    for stage in (source, ngram_step, sink):
        featureset = stage.visit(featureset)
def remove_punctuation(table: str, column: str):
    """Pipe a fresh Dataframe through the load, RemovePunctuation and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
    """
    empty = Dataframe()
    reader = LoadFromDatabase(table=table, column=column)
    writer = LoadToDatabase(table=table, column=column)
    stripper = RemovePunctuation()

    loaded = reader.visit(empty)
    stripped = stripper.visit(loaded)
    writer.visit(stripped)
def lemmatize(table: str, column: str):
    """Pipe a fresh Dataframe through the load, Lemmatizer and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
    """
    featureset = Dataframe()
    source = LoadFromDatabase(table=table, column=column)
    sink = LoadToDatabase(table=table, column=column)
    lemmatizer = Lemmatizer()

    # Each visitor's result feeds the next one; the sink's result is unused.
    for stage in (source, lemmatizer, sink):
        featureset = stage.visit(featureset)
def stem(table: str, column: str):
    """Pipe a fresh Dataframe through the load, Stemmer and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
    """
    empty = Dataframe()
    reader = LoadFromDatabase(table=table, column=column)
    writer = LoadToDatabase(table=table, column=column)
    stemmer = Stemmer()

    loaded = reader.visit(empty)
    stemmed = stemmer.visit(loaded)
    writer.visit(stemmed)
def text_binary(table: str, column: str):
    """Pipe a fresh Dataframe through the load, TextToBinary and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to both database visitors and to TextToBinary.
    """
    featureset = Dataframe()
    source = LoadFromDatabase(table=table, column=column)
    sink = LoadToDatabase(table=table, column=column)
    to_binary = TextToBinary(column=column)

    # Each visitor's result feeds the next one; the sink's result is unused.
    for stage in (source, to_binary, sink):
        featureset = stage.visit(featureset)
def uppercase(table: str, column: str):
    """Pipe a fresh Dataframe through the load, UpperCase and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
    """
    empty = Dataframe()
    reader = LoadFromDatabase(table=table, column=column)
    writer = LoadToDatabase(table=table, column=column)
    upper = UpperCase()

    loaded = reader.visit(empty)
    uppercased = upper.visit(loaded)
    writer.visit(uppercased)
def normalize(table: str, column: str):
    """Pipe a fresh Dataframe through the load, Normalizer and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
    """
    featureset = Dataframe()
    source = LoadFromDatabase(table=table, column=column)
    sink = LoadToDatabase(table=table, column=column)
    normalizer = Normalizer()

    # Each visitor's result feeds the next one; the sink's result is unused.
    for stage in (source, normalizer, sink):
        featureset = stage.visit(featureset)
def sort(table: str, column: str, mode: str = "shuffle"):
    """Pipe a fresh Dataframe through the load, Sort and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to both database visitors and to Sort.
        mode: Passed as the first positional argument of Sort.
    """
    empty = Dataframe()
    reader = LoadFromDatabase(table=table, column=column)
    writer = LoadToDatabase(table=table, column=column)
    sorter = Sort(mode, column)

    loaded = reader.visit(empty)
    ordered = sorter.visit(loaded)
    writer.visit(ordered)
def fillempty(table: str, column: str, feature_type: str, value: float):
    """Pipe a fresh Dataframe through the load, FillEmptyCells and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to both database visitors and to FillEmptyCells.
        feature_type: Second positional argument of FillEmptyCells.
        value: Third positional argument of FillEmptyCells.
    """
    featureset = Dataframe()
    source = LoadFromDatabase(table=table, column=column)
    sink = LoadToDatabase(table=table, column=column)
    filler = FillEmptyCells(column, feature_type, value)

    # Each visitor's result feeds the next one; the sink's result is unused.
    for stage in (source, filler, sink):
        featureset = stage.visit(featureset)
def character_sum(table: str, column: str):
    """Pipe a fresh Dataframe through the load, CharacterSum and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to both database visitors and to CharacterSum.
    """
    empty = Dataframe()
    reader = LoadFromDatabase(table=table, column=column)
    writer = LoadToDatabase(table=table, column=column)
    # Named `summer` so the local does not shadow the builtin `sum`.
    summer = CharacterSum(column=column)

    loaded = reader.visit(empty)
    summed = summer.visit(loaded)
    writer.visit(summed)
def remove_character(table: str, column: str, char: str):
    """Pipe a fresh Dataframe through the load, RemoveChar and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
        char: Passed to RemoveChar as its ``char`` keyword.
    """
    featureset = Dataframe()
    source = LoadFromDatabase(table=table, column=column)
    sink = LoadToDatabase(table=table, column=column)
    remover = RemoveChar(char=char)

    # Each visitor's result feeds the next one; the sink's result is unused.
    for stage in (source, remover, sink):
        featureset = stage.visit(featureset)
def mask(table: str, column: str, condition: str):
    """Pipe a fresh Dataframe through the load, Mask and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to both database visitors and to Mask.
        condition: Passed as the first positional argument of Mask.
    """
    empty = Dataframe()
    reader = LoadFromDatabase(table=table, column=column)
    writer = LoadToDatabase(table=table, column=column)
    masker = Mask(condition, column)

    loaded = reader.visit(empty)
    masked = masker.visit(loaded)
    writer.visit(masked)
def start_number(table: str, column: str):
    """Pipe a fresh Dataframe through the load, StartWithNumber and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to both database visitors and to StartWithNumber.
    """
    featureset = Dataframe()
    source = LoadFromDatabase(table=table, column=column)
    sink = LoadToDatabase(table=table, column=column)
    starts_with_number = StartWithNumber(column=column)

    # Each visitor's result feeds the next one; the sink's result is unused.
    for stage in (source, starts_with_number, sink):
        featureset = stage.visit(featureset)
def test_unit(self):
    """Run Tokenizer over space-separated strings and compare the token lists."""
    name = "test_column"
    # Pairs of (input sentence, tokens the test expects for it).
    cases = [
        ("Test String", ["Test", "String"]),
        ("anothertest string", ["anothertest", "string"]),
        ("Another test string", ["Another", "test", "string"]),
        ("test", ["test"]),
    ]
    raw_frame = pandas.DataFrame({name: [sentence for sentence, _ in cases]})
    expected_frame = pandas.DataFrame({name: [tokens for _, tokens in cases]})

    actual = Dataframe()
    expected = Dataframe()
    actual.set_dataframe(raw_frame)
    expected.set_dataframe(expected_frame)

    # The visitor's return value is ignored; the assertion inspects `actual`.
    Tokenizer(name).visit(actual)

    test.assert_frame_equal(actual.get_dataframe(), expected.get_dataframe())
def test_unit(self):
    """Run Mask with the condition "featureset > 3" over two integer columns.

    The expected frame keeps values up to 3 and holds NaN where the input
    value was greater than 3, in both columns.
    """
    column = "test_column"
    column_1 = "second_column"
    condition = "featureset > 3"
    test_data = pandas.DataFrame({
        column: [1, 2, 3, 4, 5],
        column_1: [5, 4, 3, 2, 1],
    })
    # `np.nan` is used instead of the `np.NaN` alias, which was removed in
    # NumPy 2.0 and raises AttributeError there.
    verify_data = pandas.DataFrame({
        column: [1, 2, 3, np.nan, np.nan],
        column_1: [np.nan, np.nan, 3, 2, 1],
    })

    test_featureset = Dataframe()
    verify_featureset = Dataframe()
    test_featureset.set_dataframe(test_data)
    verify_featureset.set_dataframe(verify_data)

    # Mask is built without a column argument here; the expected frame shows
    # both columns being masked.
    visitor = Mask(condition)
    visitor.visit(test_featureset)

    test.assert_frame_equal(test_featureset.get_dataframe(),
                            verify_featureset.get_dataframe())
def minmaxscaler(table: str, column: str, feature_range: tuple = (0, 1),
                 copy: bool = True) -> None:
    """Pipe a fresh Dataframe through the load, MinMaxScaler and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
        feature_range: First positional argument of MinMaxScaler. Annotated
            as ``tuple`` to match the ``(0, 1)`` default; the previous
            ``bytearray`` annotation did not describe the value passed.
        copy: Second positional argument of MinMaxScaler.
    """
    dataframe = Dataframe()
    download = LoadFromDatabase(table=table, column=column)
    upload = LoadToDatabase(table=table, column=column)
    scaler = MinMaxScaler(feature_range, copy)

    dataframe = download.visit(dataframe)
    dataframe = scaler.visit(dataframe)
    upload.visit(dataframe)
def split(table: str, column: str, id_split: dict, mode: str = "sequential"):
    """Pipe a fresh Dataframe through the load, Split and store visitors.

    Args:
        table: Forwarded to LoadFromDatabase and LoadToDatabase.
        column: Forwarded to LoadFromDatabase and LoadToDatabase.
        id_split: First positional argument of Split.
        mode: Second positional argument of Split.
    """
    empty = Dataframe()
    reader = LoadFromDatabase(table=table, column=column)
    writer = LoadToDatabase(table=table, column=column)
    splitter = Split(id_split, mode)

    loaded = reader.visit(empty)
    # Per the original author's note, Split.visit returns a dict of feature
    # sets ({name, data}); that dict is handed to the store visitor as-is.
    parts = splitter.visit(loaded)
    writer.visit(parts)