示例#1
0
# Load data.
# TODO: in order to run this script you have to provide your own data under DATA_PATH.
df_gte = pd.read_csv(os.path.join(DATA_PATH, 'a_gte.csv'))
df_train = pd.read_csv(os.path.join(DATA_PATH, 'q_train.csv'))
df_test = pd.read_csv(os.path.join(DATA_PATH, 'q_test.csv'))
# Drop the first 100 test rows: they are the validation data used in 5.1 for model selection.
df_test = df_test.drop(df_test.head(100).index)
# Read the known latent traits inside a `with` block so the file handle is closed as soon
# as loading finishes instead of being left open until garbage collection.
# NOTE: unpickling can execute arbitrary code; only load a file you produced yourself.
with open(os.path.join(DATA_PATH, 'known_latent_traits.p'), "rb") as latent_traits_file:
    dict_latent_traits = pickle.load(latent_traits_file)

# define latent traits calibrator (known latent traits): it is built directly from
# dict_latent_traits, which was unpickled from 'known_latent_traits.p' above
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

# Output text file, opened for writing ('w' truncates an existing file).
# NOTE(review): no matching close() is visible in this part of the script — confirm the
# handle is closed further down (or switch to a `with` block around the code that writes).
file = open("outputs/5_3_read_ling.txt", 'w')

def _read_ling_rf_pipeline(latent_trait_range):
    """Build a readability + linguistic feature pipeline feeding a random forest.

    Fresh component instances are created on every call; only the latent trait
    range handed to the regression component differs between callers.
    """
    feature_module = FeatureEngineeringModule([
        ReadabilityFeaturesComponent(),
        LinguisticFeaturesComponent(),
    ])
    forest = SklearnRegressionComponent(
        RandomForestRegressor(random_state=SEED),
        latent_trait_range=latent_trait_range,
    )
    return FeatureEngAndRegressionPipeline(feature_module, RegressionModule([forest]))


# one pipeline per latent trait: difficulty first, then discrimination
pipe_b = _read_ling_rf_pipeline(B_RANGE)
pipe_a = _read_ling_rf_pipeline(A_RANGE)

# create the estimator from text from the two pipelines, keyed by latent trait
pipelines_by_trait = {DIFFICULTY: pipe_b, DISCRIMINATION: pipe_a}
estimator_from_text = FeatureEngAndRegressionEstimatorFromText(pipelines_by_trait)
model = Text2PropsModel(latent_traits_calibrator, estimator_from_text)
# no calibration data is passed; presumably the known-parameters calibrator needs none — confirm
model.calibrate_latent_traits(None)

# define parameters for randomized CV
dict_params = {
    DIFFICULTY: [{'regressor__n_estimators': randint(20, 200), 'regressor__max_depth': randint(2, 50)}],
示例#2
0
df_train = pd.read_csv(os.path.join(DATA_PATH, 'q_train.csv'))
df_test = pd.read_csv(os.path.join(DATA_PATH, 'q_test.csv'))
# Drop the first 100 test rows: they are the validation data used in 5.1 for model selection.
df_test = df_test.drop(df_test.head(100).index)
# Read the known latent traits inside a `with` block so the file handle is closed as soon
# as loading finishes instead of being left open until garbage collection.
# NOTE: unpickling can execute arbitrary code; only load a file you produced yourself.
with open(os.path.join(DATA_PATH, 'known_latent_traits.p'), "rb") as latent_traits_file:
    dict_latent_traits = pickle.load(latent_traits_file)

# define latent traits calibrator (known latent traits): it is built directly from
# dict_latent_traits, which was unpickled from 'known_latent_traits.p' above
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

# Output text file, opened for writing ('w' truncates an existing file).
# NOTE(review): no matching close() is visible in this part of the script — confirm the
# handle is closed further down (or switch to a `with` block around the code that writes).
file = open("outputs/5_3_ling.txt", 'w')

# Both pipelines use linguistic features only and a seeded random forest regressor;
# they differ solely in the latent trait range given to the regression component.

# difficulty pipeline
ling_features_b = FeatureEngineeringModule([LinguisticFeaturesComponent()])
forest_b = SklearnRegressionComponent(
    RandomForestRegressor(random_state=SEED),
    latent_trait_range=B_RANGE,
)
pipe_b = FeatureEngAndRegressionPipeline(ling_features_b, RegressionModule([forest_b]))

# discrimination pipeline
ling_features_a = FeatureEngineeringModule([LinguisticFeaturesComponent()])
forest_a = SklearnRegressionComponent(
    RandomForestRegressor(random_state=SEED),
    latent_trait_range=A_RANGE,
)
pipe_a = FeatureEngAndRegressionPipeline(ling_features_a, RegressionModule([forest_a]))
# create the estimator from text from the previous pipelines
estimator_from_text = FeatureEngAndRegressionEstimatorFromText({
    DIFFICULTY:
    pipe_b,
示例#3
0
# define latent traits calibrator (known latent traits)
# NOTE(review): dict_latent_traits is defined above this excerpt; presumably it holds the
# pre-computed latent traits the calibrator serves back — confirm against its definition
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

for min_df in np.arange(0.00, 0.11, 0.02):
    for max_df in np.arange(0.90, 1.01, 0.02):

        file = open("outputs/5_1_model_selection_RF_mindf_%.2f_maxdf_%.2f.txt" % (min_df, max_df), 'w')
        file.write("MIN_DF = %.2f - MAX DF = %.2f" % (min_df, max_df))

        # pipeline difficulty
        vec_b = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_b = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_b, concatenate_correct=True, concatenate_wrong=True),
                LinguisticFeaturesComponent(),
                ReadabilityFeaturesComponent(),
            ]),
            RegressionModule([
                SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=B_RANGE)
            ])
        )
        # pipeline discrimination
        vec_a = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_a = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_a, concatenate_correct=True, concatenate_wrong=True),
                LinguisticFeaturesComponent(),
                ReadabilityFeaturesComponent(),
            ]),
            RegressionModule([