def __init__(self, random_state: int = None, known_latent_traits: Dict[str, Dict[str, float]] = None):
    """Assemble the calibrator and the text-based estimators, then initialise the parent.

    Two TF-IDF + random-forest pipelines are built, one per latent trait
    (``DIFFICULTY`` and ``DISCRIMINATION``), and wrapped in a
    ``FeatureEngAndRegressionEstimatorFromText``.

    Args:
        random_state: Forwarded as ``random_state`` to both
            ``RandomForestRegressor`` instances.
        known_latent_traits: Optional dictionary of already-known latent
            traits. When given, its keys must be exactly ``DIFFICULTY`` and
            ``DISCRIMINATION`` and it is handed to
            ``KnownParametersCalibrator``. When ``None``, an
            ``IRTCalibrator`` built from ``DIFFICULTY_RANGE`` and
            ``DISCRIMINATION_RANGE`` is used instead.

    Raises:
        ValueError: If ``known_latent_traits`` is given but its keys are not
            exactly ``{DIFFICULTY, DISCRIMINATION}``.
    """
    if known_latent_traits is None:
        latent_traits_calibrator = IRTCalibrator(DIFFICULTY_RANGE, DISCRIMINATION_RANGE)
    else:
        # Validate *before* building the calibrator, so a malformed dictionary
        # always surfaces as this ValueError rather than as whatever the
        # calibrator does with unexpected keys.
        if set(known_latent_traits) != {DIFFICULTY, DISCRIMINATION}:
            raise ValueError("wrong keys in known_latent_traits dictionary")
        latent_traits_calibrator = KnownParametersCalibrator(latent_traits=known_latent_traits)

    def build_pipeline(max_features, n_estimators, max_depth, latent_trait_range):
        # One TF-IDF vectorizer feeding one random-forest regressor; the two
        # traits differ only in the hyper-parameters passed in here.
        vectorizer = TfidfVectorizer(
            stop_words='english',
            preprocessor=vectorizer_text_preprocessor,
            max_features=max_features,
        )
        regressor = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state,
        )
        return FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vectorizer, concatenate_correct=True, concatenate_wrong=True),
            ]),
            RegressionModule([
                SklearnRegressionComponent(regressor, latent_trait_range=latent_trait_range),
            ]),
        )

    # Difficulty is built first, then discrimination (same order as before).
    feat_eng_regression_pipeline_difficulty = build_pipeline(
        max_features=1000,
        n_estimators=250,
        max_depth=50,
        latent_trait_range=DIFFICULTY_RANGE,
    )
    feat_eng_regression_pipeline_discrimination = build_pipeline(
        max_features=800,
        n_estimators=100,
        max_depth=25,
        latent_trait_range=DISCRIMINATION_RANGE,
    )

    estimator_from_text = FeatureEngAndRegressionEstimatorFromText({
        DIFFICULTY: feat_eng_regression_pipeline_difficulty,
        DISCRIMINATION: feat_eng_regression_pipeline_discrimination,
    })
    super().__init__(latent_traits_calibrator, estimator_from_text)
FeatureEngineeringModule([ReadabilityFeaturesComponent()]), RegressionModule([ SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=B_RANGE) ])) # pipeline discrimination pipe_a = FeatureEngAndRegressionPipeline( FeatureEngineeringModule([ReadabilityFeaturesComponent()]), RegressionModule([ SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=A_RANGE) ])) # create estimator from text form the previous pipelines estimator_from_text = FeatureEngAndRegressionEstimatorFromText({ DIFFICULTY: pipe_b, DISCRIMINATION: pipe_a }) model = Text2PropsModel(latent_traits_calibrator, estimator_from_text) model.calibrate_latent_traits(None) # define parameters for randomized CV dict_params = { DIFFICULTY: [{ 'regressor__n_estimators': randint(20, 200), 'regressor__max_depth': randint(2, 50) }], DISCRIMINATION: [{ 'regressor__n_estimators': randint(20, 200), 'regressor__max_depth': randint(2, 50) }],
concatenate_correct=True, concatenate_wrong=True), LinguisticFeaturesComponent(), ReadabilityFeaturesComponent(), ]), RegressionModule([ SklearnRegressionComponent(RFRegressor(n_estimators=100, max_depth=20, random_state=SEED), latent_trait_range=DISCRIMINATION_RANGE) ])) model = Text2PropsModel( KnownParametersCalibrator(dict_latent_traits), FeatureEngAndRegressionEstimatorFromText({ DIFFICULTY: pipeline_difficulty, DISCRIMINATION: pipeline_discrimination })) model.train(df_train) print('[INFO] model trained') # Here I estimate the latent traits for the test set dict_predictions_test_set = model.predict(df_test) # I have to convert the dictionary of the prediction in the right format as model.predict returns a dict of lists # (one list for each latent trait) dict_predicted_latent_traits = dict() dict_predicted_latent_traits[DIFFICULTY], dict_predicted_latent_traits[ DISCRIMINATION] = dict(), dict() for idx, q_id in enumerate(df_test[Q_ID].values): dict_predicted_latent_traits[DIFFICULTY][q_id] = dict_predictions_test_set[ DIFFICULTY][idx]