def test_MLPClassifier(irisDataSet, irisClassificationTestCase, testResources):
    """Checks that a torch-based MLP classifier reaches at least 80% accuracy on the iris data set."""
    inputColumns = irisDataSet.getInputOutputData().inputs.columns
    # one exact-match normalisation rule per input column, standard-scaled by default
    normalisation = DFTNormalisation(
        [DFTNormalisation.Rule(re.escape(colName)) for colName in inputColumns],
        defaultTransformerFactory=sklearn.preprocessing.StandardScaler)
    classifier = sensai.torch.models.MultiLayerPerceptronVectorClassificationModel(
            hiddenDims=(50, 25, 8), cuda=False, epochs=100, optimiser="adam",
            batchSize=200, normalisationMode=NormalisationMode.NONE,
            hidActivationFunction=torch.tanh) \
        .withName("torchMLPClassifier") \
        .withInputTransformers([normalisation]) \
        .withFeatureGenerator(FeatureGeneratorTakeColumns())
    irisClassificationTestCase.testMinAccuracy(classifier, 0.8)
def test_multiColumnSingleRule(self):
    """A single rule matching several columns with independentColumns=False must scale them jointly."""
    values = np.array([1, 5, 10])
    frame = pd.DataFrame({"foo": values, "bar": values * 100})
    jointRule = DFTNormalisation.Rule(
        r"foo|bar",
        transformer=sklearn.preprocessing.MaxAbsScaler(),
        independentColumns=False)
    normalised = DFTNormalisation([jointRule]).fitApply(frame)
    # joint scaling divides both columns by the overall maximum absolute value (1000)
    assert np.all(normalised.foo == values / 1000)
    assert np.all(normalised.bar == values / 10)
def test_arrayValued(self):
    """A rule with arrayValued=True must normalise array-valued cells using one shared scaler."""
    values = np.array([1, 5, 10])
    frame = pd.DataFrame({"foo": [values, 2 * values, 10 * values]})
    arrayRule = DFTNormalisation.Rule(
        r"foo|bar",
        transformer=sklearn.preprocessing.MaxAbsScaler(),
        arrayValued=True)
    normalised = DFTNormalisation([arrayRule]).fitApply(frame)
    # the maximum absolute value across all cell arrays is 100, so every entry is divided by 100
    assert np.all(normalised.foo.iloc[0] == values / 100)
    assert np.all(normalised.foo.iloc[-1] == values / 10)
def sentenceEmbeddingFeatureGeneratorFactory(cachePath=None, persistCache=True):
    """
    Creates a feature generator that produces sentence encodings for the 'reviewText' column.

    :param cachePath: path to the sentence-encoding cache; if None, the module-level CACHE_PATH is used.
        (Parameter added because a visible call site passes the cache path as the first positional
        argument, which previously would have been bound to persistCache.)
    :param persistCache: whether newly computed encodings are persisted to the cache
    :return: a FeatureGeneratorFromColumnGenerator wrapping the sentence-encoding column generator,
        with its normalisation rule template marked unsupported=True
    """
    if cachePath is None:
        cachePath = CACHE_PATH
    columnGen = ColumnGeneratorSentenceEncodings("reviewText", encodingProvider, cachePath,
        persistCache=persistCache)
    return FeatureGeneratorFromColumnGenerator(
        columnGen,
        normalisationRuleTemplate=DFTNormalisation.RuleTemplate(unsupported=True))
def test_FeatureGeneratorNAMarker(irisClassificationTestCase):
    """
    Integration test for handling of N/A values via marker features (using FeatureGeneratorNAMarker)
    in the context of models that do not support N/A values, replacing them with a different value (using FillNA)
    """
    iodata = irisClassificationTestCase.data

    # create some random N/A values in the data set
    inputs = iodata.inputs.copy()
    rand = random.Random(42)
    fullIndices = list(range(len(inputs)))
    for col in inputs.columns:
        indices = rand.sample(fullIndices, 20)
        # FIX: assign via .loc on the frame itself instead of the chained
        # `inputs[col].iloc[indices] = np.nan`, which writes to a temporary Series and is
        # not guaranteed to modify `inputs` (SettingWithCopyWarning; a silent no-op under
        # pandas copy-on-write), which would leave the data without any N/A values
        inputs.loc[inputs.index[indices], col] = np.nan
    iodata = InputOutputData(inputs, iodata.outputs)

    # run once with and once without the N/A marker features; only the former should succeed
    for useFGNA in (True, False):
        fgs = [
            FeatureGeneratorTakeColumns(
                normalisationRuleTemplate=DFTNormalisation.RuleTemplate(independentColumns=True))
        ]
        if useFGNA:
            fgs.append(FeatureGeneratorNAMarker(inputs.columns))
        fCollector = FeatureCollector(*fgs)

        model = SkLearnMLPVectorClassificationModel() \
            .withFeatureCollector(fCollector) \
            .withInputTransformers(
                DFTNormalisation(fCollector.getNormalisationRules(),
                    defaultTransformerFactory=SkLearnTransformerFactoryFactory.StandardScaler()),
                DFTFillNA(-3))
        # NOTE: using -3 instead of 0 to fill N/A values in order to force the model to learn the purpose of the N/A markers,
        # because 0 values are actually a reasonable fallback (which happens to work) when using StandardScaler
        # NOTE: it is important to apply DFTNormalisation before DFTFillNA, because DFTNormalisation would learn using the filled values otherwise

        ev = VectorClassificationModelEvaluator(iodata, testFraction=0.2)
        ev.fitModel(model)
        result = ev.evalModel(model)
        accuracy = result.getEvalStats().getAccuracy()
        log.info(f"Accuracy (for useFGNA={useFGNA}) = {accuracy}")
        if useFGNA:
            assert accuracy > 0.85
        else:
            assert accuracy < 0.85
flattenedPandasDf: pd.DataFrame = ... # Load/insert the flattened dataframe from a previous step CACHE_PATH: str = ... # replace by a lightweight model for lambda reviewClassifier = models.MultiLayerPerceptronVectorClassificationModel( hiddenDims=[50, 50, 20], cuda=False, epochs=300) # add the feature generator that was previously used to fill the cache to the model # encodingProvider = TextStatEncodingProvider() # for lambda encodingProvider = BertBaseMeanEncodingProvider() reviewEncodingFeatureGen = sentenceEmbeddingFeatureGeneratorFactory( CACHE_PATH, persistCache=False) encodedReviewColName = reviewEncodingFeatureGen.columnGen.generatedColumnName flattenedSentenceEncodingsFeatureregen = flattenedFeatureGenerator( reviewEncodingFeatureGen, normalisationRuleTemplate=DFTNormalisation.RuleTemplate(skip=True)) reviewFeatureCollector = FeatureCollector( flattenedSentenceEncodingsFeatureregen) reviewClassifier = reviewClassifier.withFeatureCollector( reviewFeatureCollector) # split off the targets and train targetDf = pd.DataFrame(flattenedPandasDf.pop("overall")) inputOutputData = InputOutputData(flattenedPandasDf, targetDf) evalModelViaEvaluator(reviewClassifier, inputOutputData, testFraction=0.01, plotTargetDistribution=True) # save model, load it and try predict as integration test with open("reviewClassifier-v1.pickle", 'wb') as f: