
# Inspecting the dataset:
df.describe()

# Filling the missing numeric (NA) values with the median.
df = df.fillna(df.median(numeric_only=True))

df.head(5)

# Defining the dependent/independent variables
X = df.drop(columns=['SalePrice']).values
y = df['SalePrice'].values.reshape(-1, 1)

# Feature scaling:
X = feature_scaling(X)

# Inserting a column of ones at the start of the feature matrix so that the
# intercept term can be included in the calculations.
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

X_train[1:5, :5]

################### Forward Elimination: ###################
# This process incrementally analyses each feature's contribution to the final model.
# At every iteration one feature is added, and its impact on the model is assessed through its *p-value*.
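# A minimal sketch of how this forward selection could be implemented,
# assuming statsmodels is available (this helper is not from the original
# source); column 0 of X is the intercept column added above.
import statsmodels.api as sm

def forward_selection(X, y, significance_level=0.05):
    selected = [0]                              # always keep the intercept column
    remaining = list(range(1, X.shape[1]))
    while remaining:
        # Fit one candidate model per remaining feature and record the
        # p-value of the newly added column
        pvalues = {j: sm.OLS(y, X[:, selected + [j]]).fit().pvalues[-1]
                   for j in remaining}
        best = min(pvalues, key=pvalues.get)
        if pvalues[best] >= significance_level:
            break                               # no remaining feature is significant enough
        selected.append(best)
        remaining.remove(best)
    return selected

selected_columns = forward_selection(X_train, y_train)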
################### Example #2: ###################
def perform_feature_analysis(train_tokens, train_pos, train_labels,
                             test_tokens, test_pos, test_labels, feature_set,
                             features_names, activation_list,
                             feature_function):

    if feature_set != 'topic':
        # Get all the train and test features, each in their own dictionary of feature names:feature values
        all_train_features = collect_features(train_tokens, train_pos,
                                              feature_set, feature_function)
        all_test_features = collect_features(test_tokens, test_pos,
                                             feature_set, feature_function)
        """
        # This is in case we want to run over all possible combinations of features, 
        # and not using the activation_list argument (not done here as it is obv huge to run through all 2^|features|)
        combinations_of_features = len(_features_names)
        activation_list = list(itertools.product([True, False], repeat=combinations_of_features))
        activation_list = activation_list[:-1]      # exclude the option when all activations are false
        """
        for activation in activation_list:
            # Print the current grid of selected features
            utils.print_features(activation, features_names)

            # Select the active features, used in current analysis
            selected_features = [
                name for name, active in zip(features_names, activation)
                if active
            ]
            active_train_features = select_active_features(
                all_train_features, selected_features)
            active_test_features = select_active_features(
                all_test_features, selected_features)

            # Convert feature dictionary to a list of feature values (a dict vectorizer)
            train_features, test_features = utils.extract_features_from_dict(
                active_train_features, active_test_features)

            # Scale the features
            scaled_train_features = utils.feature_scaling(train_features)
            scaled_test_features = utils.feature_scaling(test_features)

            # Run the models
            utils.run_supervised_learning_models(scaled_train_features,
                                                 train_labels,
                                                 scaled_test_features,
                                                 test_labels)
    else:
        for activation in activation_list:
            # Print the current grid of selected features
            utils.print_features(activation, features_names)

            # Build the topic LDA model
            dictionary, corpus, lda_model = \
                extract_feature.build_lda_model(train_tokens, train_pos,
                                                use_nouns=activation[2], use_verbs=activation[3], use_all=activation[4],
                                                num_of_topics=activation[0], passes=activation[1], verbose=False)
            # Get all the train and test features, each in their own dictionary of feature names:feature values
            train_features = []
            for index in range(len(train_tokens)):
                this_tweet_features = perform_function(
                    extract_feature.get_topic_features, corpus, lda_model,
                    index)
                train_features.append({**this_tweet_features})
            test_features = []
            for token, pos in zip(test_tokens, test_pos):
                this_tweet_features = perform_function(
                    extract_feature.get_topic_features_for_unseen_tweet,
                    dictionary, lda_model, token.split(), pos.split(),
                    activation[2], activation[3], activation[4])
                test_features.append({**this_tweet_features})

            # Convert feature dictionary to a list of feature values (a dict vectorizer)
            train_features, test_features = utils.extract_features_from_dict(
                train_features, test_features)

            # Run the models
            utils.run_supervised_learning_models(train_features, train_labels,
                                                 test_features, test_labels)
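# A hypothetical usage sketch for perform_feature_analysis; the feature names,
# the feature_set value, and the extract function below are illustrative
# assumptions, not names taken from the original source.
features_names = ['has_url', 'exclamation_count', 'avg_word_length']   # hypothetical
activation_list = [
    (True, False, False),    # evaluate the first feature alone
    (True, True, True),      # evaluate all three features together
]
perform_feature_analysis(train_tokens, train_pos, train_labels,
                         test_tokens, test_pos, test_labels,
                         feature_set='stylistic',          # hypothetical; any value other than 'topic'
                         features_names=features_names,
                         activation_list=activation_list,
                         feature_function=extract_feature.get_stylistic_features)  # hypothetical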
# Filling the null values with the median
df = df.fillna(df.median(numeric_only=True))
# Displaying some of the rows that had null values, by index:
df.iloc[[32, 126, 374], :]

# Defining the dependent/independent variables.
X = df.iloc[:, 1:8]
y = df.iloc[:, 0]

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# Feature scaling
X_train = feature_scaling(X_train)
X_test = feature_scaling(X_test)

# Training the regression model with the training set
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

# Predicting the results with the test set
y_pred = regressor.predict(X_test)

# Evaluating the model with the R² metric
regressor.score(X_test, y_test)

# Evaluating the model with the RMSE metric
np.sqrt(mean_squared_error(y_test, y_pred))
def feature_scaling(self):
    # Scale the training matrix and keep the scaling coefficients for later reuse
    self.T, self.feature_scaling_coefficient = utils.feature_scaling(self.T)
    self.training_features = np.transpose(self.T)
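# The feature_scaling helper itself is not shown in this excerpt. Below is a
# minimal sketch of what it might look like, assuming min-max scaling that also
# returns the per-column coefficients (matching the two-value signature used in
# the method above; the earlier snippets appear to use a single-return variant).
import numpy as np

def feature_scaling(features):
    features = np.asarray(features, dtype=float)
    col_min = features.min(axis=0)
    col_range = features.max(axis=0) - col_min
    col_range[col_range == 0] = 1.0        # avoid division by zero on constant columns
    scaled = (features - col_min) / col_range
    return scaled, (col_min, col_range)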