def test_trees(self):
    """Fit OneHotVectorizer + FastTreesBinaryClassifier on get_X_y data.

    Features and labels come from ``get_X_y`` on the train file; the
    fitted pipeline predicts on the test features and the result is
    handed to ``check_accuracy`` together with the value 0.65
    (presumably the minimum acceptable accuracy -- confirm against
    ``check_accuracy``).
    """
    X_train, y_train = get_X_y(train_file, label_column, sep=',')
    # The test labels are not used here: check_accuracy receives
    # test_file and label_column directly.
    X_test, _ = get_X_y(test_file, label_column, sep=',')

    encoder = OneHotVectorizer() << categorical_columns
    learner = FastTreesBinaryClassifier()
    model = Pipeline([encoder, learner])
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    check_accuracy(test_file, label_column, predictions, 0.65)
def test_trees_file(self):
    """Fit OneHotVectorizer + FastTreesBinaryClassifier from file streams.

    The learner is given ``{'Label': label_column}`` via ``<<`` and the
    label column is also passed to ``fit``. Train and test data are
    wrapped in ``FileDataStream`` objects built with ``file_schema``;
    predictions are passed to ``check_accuracy`` with the value 0.65
    (presumably the minimum acceptable accuracy -- confirm against
    ``check_accuracy``).
    """
    encoder = OneHotVectorizer() << categorical_columns
    learner = FastTreesBinaryClassifier() << {'Label': label_column}
    model = Pipeline([encoder, learner])

    training_data = FileDataStream(train_file, schema=file_schema)
    model.fit(training_data, label_column)

    scoring_data = FileDataStream(test_file, schema=file_schema)
    predictions = model.predict(scoring_data)
    check_accuracy(test_file, label_column, predictions, 0.65)
def test_linear_with_train_test_schema(self):
    """Fit OneHotVectorizer + FastLinearBinaryClassifier on get_X_y data.

    The learner is created with ``train_threads=1`` and
    ``shuffle=False``. Features and labels come from ``get_X_y``;
    predictions on the test features are passed to ``check_accuracy``
    with the value 0.65 (presumably the minimum acceptable accuracy --
    confirm against ``check_accuracy``).
    """
    X_train, y_train = get_X_y(train_file, label_column, sep=',')
    # The test labels are not used here: check_accuracy receives
    # test_file and label_column directly.
    X_test, _ = get_X_y(test_file, label_column, sep=',')

    encoder = OneHotVectorizer() << categorical_columns
    learner = FastLinearBinaryClassifier(train_threads=1, shuffle=False)
    model = Pipeline([encoder, learner])
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    check_accuracy(test_file, label_column, predictions, 0.65)
def test_linear_file_role(self):
    """Fit the linear pipeline with the label set as a role on the stream.

    Instead of passing the label column to ``fit``, this test calls
    ``_set_role('Label', label_column)`` on the training
    ``FileDataStream`` and then calls ``fit`` with the stream alone.
    Predictions on the test stream are passed to ``check_accuracy``
    with the value 0.65 (presumably the minimum acceptable accuracy --
    confirm against ``check_accuracy``).
    """
    encoder = OneHotVectorizer() << categorical_columns
    learner = FastLinearBinaryClassifier(train_threads=1, shuffle=False)
    model = Pipeline([encoder, learner])

    training_data = FileDataStream(train_file, schema=file_schema)
    # The label is attached to the stream before fitting; fit() gets no
    # separate label argument in this test.
    training_data._set_role('Label', label_column)
    model.fit(training_data)

    scoring_data = FileDataStream(test_file, schema=file_schema)
    predictions = model.predict(scoring_data)
    check_accuracy(test_file, label_column, predictions, 0.65)
def test_linear_file(self):
    """Fit the linear pipeline from file streams, passing the label to fit.

    Before fitting, the test asserts that the training stream's schema
    options contain both ``'sep'`` and ``'header'``. Predictions on the
    test stream are passed to ``check_accuracy`` with the value 0.65
    (presumably the minimum acceptable accuracy -- confirm against
    ``check_accuracy``).
    """
    encoder = OneHotVectorizer() << categorical_columns
    learner = FastLinearBinaryClassifier(train_threads=1, shuffle=False)
    model = Pipeline([encoder, learner])

    training_data = FileDataStream(train_file, schema=file_schema)
    # Both options must be present on the schema built from file_schema.
    for required_option in ('sep', 'header'):
        assert required_option in training_data.schema.options
    model.fit(training_data, label_column)

    scoring_data = FileDataStream(test_file, schema=file_schema)
    predictions = model.predict(scoring_data)
    check_accuracy(test_file, label_column, predictions, 0.65)