示例#1
0
 def sentenceEmbeddingFeatureGeneratorFactory(persistCache=True):
     """
     Create the feature generator that yields sentence encodings of the "reviewText" column.

     The column generator is built from the ``encodingProvider`` and ``CACHE_PATH`` names
     of the enclosing scope and is then wrapped in a FeatureGeneratorFromColumnGenerator.

     :param persistCache: forwarded unchanged to ColumnGeneratorSentenceEncodings
     :return: the FeatureGeneratorFromColumnGenerator wrapping the column generator
     """
     sentenceEncodingColumnGen = ColumnGeneratorSentenceEncodings(
         "reviewText", encodingProvider, CACHE_PATH, persistCache=persistCache)
     # the generated feature is flagged with an "unsupported" normalisation rule template
     unsupportedRuleTemplate = DFTNormalisation.RuleTemplate(unsupported=True)
     return FeatureGeneratorFromColumnGenerator(
         sentenceEncodingColumnGen,
         normalisationRuleTemplate=unsupportedRuleTemplate)
示例#2
0
def test_FeatureGeneratorNAMarker(irisClassificationTestCase):
    """
    Integration test for handling of N/A values via marker features (using FeatureGeneratorNAMarker) in the
    context of models that do not support N/A values, replacing them with a different value (using DFTFillNA).

    For every input column, 20 randomly chosen rows (fixed seed) are set to N/A. An MLP classifier is then
    trained and evaluated twice, once with and once without the N/A marker features; the run with markers
    must exceed an accuracy of 0.85, the run without markers must stay below it.

    :param irisClassificationTestCase: test case fixture whose ``data`` attribute provides the inputs/outputs
    """
    iodata = irisClassificationTestCase.data

    # create some random N/A values in the data set
    inputs = iodata.inputs.copy()
    rand = random.Random(42)
    fullIndices = list(range(len(inputs)))
    for colPos in range(len(inputs.columns)):
        indices = rand.sample(fullIndices, 20)
        # Assign through a single positional indexer on the frame itself: the chained form
        # `inputs[col].iloc[indices] = ...` may write to a temporary object and is a no-op
        # under pandas Copy-on-Write, which would leave the data without any N/A values.
        inputs.iloc[indices, colPos] = np.nan
    iodata = InputOutputData(inputs, iodata.outputs)

    for useFGNA in (True, False):
        fgs = [
            FeatureGeneratorTakeColumns(
                normalisationRuleTemplate=DFTNormalisation.RuleTemplate(
                    independentColumns=True))
        ]
        if useFGNA:
            fgs.append(FeatureGeneratorNAMarker(inputs.columns))
        fCollector = FeatureCollector(*fgs)

        # NOTE: using -3 instead of 0 to fill N/A values in order to force the model to learn the purpose of
        # the N/A markers, because 0 values are actually a reasonable fallback (which happens to work) when
        # using StandardScaler
        # NOTE: it is important to apply DFTNormalisation before DFTFillNA, because DFTNormalisation would
        # learn using the filled values otherwise
        model = SkLearnMLPVectorClassificationModel() \
            .withFeatureCollector(fCollector) \
            .withInputTransformers(
                DFTNormalisation(fCollector.getNormalisationRules(), defaultTransformerFactory=SkLearnTransformerFactoryFactory.StandardScaler()),
                DFTFillNA(-3))

        ev = VectorClassificationModelEvaluator(iodata, testFraction=0.2)
        ev.fitModel(model)
        result = ev.evalModel(model)
        accuracy = result.getEvalStats().getAccuracy()
        log.info(f"Accuracy (for useFGNA={useFGNA}) = {accuracy}")
        if useFGNA:
            assert accuracy > 0.85
        else:
            assert accuracy < 0.85
示例#3
0
    # Placeholders: both values must be supplied before this snippet can run.
    flattenedPandasDf: pd.DataFrame = ...  # Load/insert the flattened dataframe from a previous step
    CACHE_PATH: str = ...

    # Classifier configuration used here (CPU only); replace it by a lightweight model for lambda.
    reviewClassifier = models.MultiLayerPerceptronVectorClassificationModel(
        hiddenDims=[50, 50, 20], cuda=False, epochs=300)

    # Add the feature generator that was previously used to fill the cache to the model.
    # For lambda, TextStatEncodingProvider() is the alternative encoding provider.
    encodingProvider = BertBaseMeanEncodingProvider()
    # NOTE(review): CACHE_PATH is passed positionally together with the persistCache keyword;
    # confirm this matches the factory's signature. A factory declared with persistCache as its
    # only parameter would raise a TypeError (multiple values for 'persistCache') on this call.
    reviewEncodingFeatureGen = sentenceEmbeddingFeatureGeneratorFactory(
        CACHE_PATH, persistCache=False)
    # NOTE(review): encodedReviewColName is not used in the visible part of this snippet.
    encodedReviewColName = reviewEncodingFeatureGen.columnGen.generatedColumnName
    # Wrap the encoding feature generator via flattenedFeatureGenerator, with normalisation skipped.
    flattenedSentenceEncodingsFeatureregen = flattenedFeatureGenerator(
        reviewEncodingFeatureGen,
        normalisationRuleTemplate=DFTNormalisation.RuleTemplate(skip=True))
    reviewFeatureCollector = FeatureCollector(
        flattenedSentenceEncodingsFeatureregen)
    reviewClassifier = reviewClassifier.withFeatureCollector(
        reviewFeatureCollector)

    # Split off the targets and train; pop() removes the "overall" column from flattenedPandasDf
    # in place, so the remaining columns serve as the model inputs.
    targetDf = pd.DataFrame(flattenedPandasDf.pop("overall"))
    inputOutputData = InputOutputData(flattenedPandasDf, targetDf)
    evalModelViaEvaluator(reviewClassifier,
                          inputOutputData,
                          testFraction=0.01,
                          plotTargetDistribution=True)

    # save model, load it and try predict as integration test
    with open("reviewClassifier-v1.pickle", 'wb') as f: