# Inspect the dataset.
df.describe()

# Fill numeric missing values (NA) with the column medians.
df = df.fillna(df.median())
df.head(5)

# Define the independent (X) / dependent (y) variables:
# every column except 'SalePrice' is a feature.
X = df[df.columns[~df.columns.isin(['SalePrice'])]].values
y = df['SalePrice'].values.reshape(-1, 1)

# Normalize the features.
X = feature_scaling(X)

# Prepend a column of ones (intercept term) to the feature matrix so the
# regression calculations can be performed.
# Fix: derive the row count from X instead of hard-coding 1460, so the
# cell still works if the dataset size changes.
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)

# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
X_train[1:5, :5]

################### Forward Elimination: ###################
# This process incrementally analyses each feature's contribution to the
# final model: at each iteration one feature is added and its impact on
# the model is assessed through its *p-value*.
def perform_feature_analysis(train_tokens, train_pos, train_labels,
                             test_tokens, test_pos, test_labels,
                             feature_set, features_names,
                             activation_list, feature_function):
    """Run the supervised-learning models over selected feature subsets.

    For a non-'topic' ``feature_set``, features are collected once for the
    train/test tokens and each entry of ``activation_list`` (a tuple of
    booleans) selects which named features are active for that run. For
    the 'topic' feature set, each activation tuple instead parameterizes
    an LDA model (num_of_topics, passes, use_nouns, use_verbs, use_all)
    whose topic distributions become the features.

    Parameters
    ----------
    train_tokens, train_pos, train_labels : training tweets, POS tags, labels.
    test_tokens, test_pos, test_labels : test tweets, POS tags, labels.
    feature_set : name of the feature family ('topic' switches to LDA mode).
    features_names : names aligned with the activation tuples.
    activation_list : iterable of boolean/parameter tuples, one per run.
    feature_function : passed through to ``collect_features``.
    """
    # Bug fix: compare string *values* with `!=`, not identity (`is not`) —
    # identity of equal string literals is implementation-dependent.
    if feature_set != 'topic':
        # Collect all train and test features once, each as a dictionary
        # of feature name -> feature value.
        all_train_features = collect_features(train_tokens, train_pos,
                                              feature_set, feature_function)
        all_test_features = collect_features(test_tokens, test_pos,
                                             feature_set, feature_function)
        # NOTE: to exhaustively sweep all feature combinations instead of
        # using the activation_list argument (not done here — 2^|features|
        # runs is prohibitive):
        #   combinations_of_features = len(_features_names)
        #   activation_list = list(itertools.product(
        #       [True, False], repeat=combinations_of_features))
        #   activation_list = activation_list[:-1]  # drop the all-False case
        for activation in activation_list:
            # Print the current grid of selected features.
            utils.print_features(activation, features_names)
            # Select the features active for this run.
            selected_features = [
                features_names[i]
                for i in range(len(features_names))
                if activation[i]
            ]
            active_train_features = select_active_features(
                all_train_features, selected_features)
            active_test_features = select_active_features(
                all_test_features, selected_features)
            # Convert the feature dictionaries to lists of feature values
            # (a dict vectorizer).
            train_features, test_features = utils.extract_features_from_dict(
                active_train_features, active_test_features)
            # Scale the features.
            scaled_train_features = utils.feature_scaling(train_features)
            scaled_test_features = utils.feature_scaling(test_features)
            # Run the models.
            utils.run_supervised_learning_models(
                scaled_train_features, train_labels,
                scaled_test_features, test_labels)
    else:
        for activation in activation_list:
            # Print the current grid of selected features.
            utils.print_features(activation, features_names)
            # Build the topic LDA model for this parameter tuple.
            dictionary, corpus, lda_model = extract_feature.build_lda_model(
                train_tokens, train_pos,
                use_nouns=activation[2],
                use_verbs=activation[3],
                use_all=activation[4],
                num_of_topics=activation[0],
                passes=activation[1],
                verbose=False)
            # Train features: topic distribution of each (seen) tweet.
            train_features = []
            for index in range(len(train_tokens)):
                this_tweet_features = perform_function(
                    extract_feature.get_topic_features, corpus,
                    lda_model, index)
                train_features.append({**this_tweet_features})
            # Test features: infer topic distributions for unseen tweets.
            test_features = []
            for token, pos in zip(test_tokens, test_pos):
                this_tweet_features = perform_function(
                    extract_feature.get_topic_features_for_unseen_tweet,
                    dictionary, lda_model, token.split(), pos.split(),
                    activation[2], activation[3], activation[4])
                test_features.append({**this_tweet_features})
            # Convert the feature dictionaries to lists of feature values
            # (a dict vectorizer).
            train_features, test_features = utils.extract_features_from_dict(
                train_features, test_features)
            # Run the models.
            utils.run_supervised_learning_models(
                train_features, train_labels, test_features, test_labels)
# Fill null values with the column medians.
df = df.fillna(df.median())

# Show some of the rows that had null values, by index.
df.iloc[[32, 126, 374], ]

# Define the independent (X) / dependent (y) variables.
X = df.iloc[:, 1:8]
y = df.iloc[:, 0]

# Split the dataset into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Normalize the features.
# NOTE(review): scaling is fit independently on the train and test sets;
# if `feature_scaling` estimates its parameters from the data, this leaks
# test-set statistics — consider reusing the train-set parameters.
X_train = feature_scaling(X_train)
X_test = feature_scaling(X_test)

# Train the regression model on the training set.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

# Predict the results on the test set.
y_pred = regressor.predict(X_test)

# Evaluate the model with the r2 metric.
regressor.score(X_test, y_test)

# Evaluate the model with the RMSE metric.
# Fix: the comment promised RMSE but the code produced plain MSE;
# take the square root so the value matches the stated metric.
mean_squared_error(y_test, y_pred) ** 0.5
def feature_scaling(self):
    """Scale the feature matrix ``self.T`` in place and cache its transpose.

    Delegates to ``utils.feature_scaling``, which returns the scaled matrix
    together with a scaling coefficient; the coefficient is stored on
    ``self.feature_scaling_coeficient`` (presumably for later inverse
    transforms — confirm against ``utils.feature_scaling``). The transposed
    scaled matrix is then stored as ``self.training_features``.
    """
    self.T, self.feature_scaling_coeficient = utils.feature_scaling(self.T)
    # NOTE(review): np.matrix.transpose is invoked as an unbound method on
    # self.T — this only works if self.T's type is compatible; the usual
    # spelling would be `self.T.transpose()` or `np.transpose(self.T)`.
    # Verify self.T's type before changing.
    self.training_features = np.matrix.transpose(self.T)