def perform_vectorization():
    """Build the TF-IDF feature table for the cleaned corpus and save it.

    Steps, in order:

    1. Read ``clean_text_directory + filename`` with ``data_helper.read_csv``
       and take the first ``size`` entries of column ``integra_index`` as the
       corpus.
    2. Build bag-of-words counts and the vocabulary, print examples and the
       vocabulary, then convert the counts to TF-IDF values.
    3. Keep a vocabulary column when its mean value is greater than
       ``threshold`` or when the term is in ``get_termos_interesse()``.
    4. Drop the last column of the input table, copy the leading column(s)
       into the label column(s) named in ``label_names`` and keep only those.
    5. Join labels and selected features on the index and hand the result to
       ``data_helper.save_file``.

    Takes no arguments and returns nothing. The configuration names
    (``clean_text_directory``, ``filename``, ``integra_index``, ``size``,
    ``threshold``, ``features_directory``) are resolved outside the function.
    """
    raw_table = data_helper.read_csv(clean_text_directory + filename)
    corpus = raw_table[integra_index][:size]

    counts, vocab = vectorization.create_bag_of_words(corpus)
    print_functions.print_examples(corpus, counts)
    print_functions.print_vocabulary(vocab)

    tfidf_values = vectorization.extract_tfidf(counts)
    tfidf_frame = pd.DataFrame(tfidf_values, columns=vocab)

    termos_interesse = get_termos_interesse()
    total_columns = len(tfidf_frame.columns)
    # Single-argument print(...) emits the same text as the print statement.
    print("Total Columns: " + str(total_columns))

    # A term survives if it is frequent enough on average or is explicitly
    # listed as a term of interest.
    columns_to_keep = [
        term
        for term in tfidf_frame.columns
        if np.mean(tfidf_frame[term]) > threshold or term in termos_interesse
    ]
    tfidf_frame = tfidf_frame[columns_to_keep]
    print("Columns to Keep: " + str(len(columns_to_keep)))

    # Everything except the last column of the input table.
    labels = pd.DataFrame(raw_table)
    labels = labels[labels.columns.values[:-1]]

    label_names = ["interesse"]  # , "exclusao", "diario", "tipo_ato"]
    source_names = labels.columns.values
    # The n-th label is a copy of the n-th remaining input column.
    for position, label_name in enumerate(label_names):
        labels[label_name] = labels[source_names[position]]
    labels = labels[label_names]

    features_table = labels.join(tfidf_frame)
    data_helper.save_file(features_table, features_directory, filename)
roman_numbers = get_roman_numbers() state_initials = bl.get_state_initials() state_names = bl.get_state_names() state_names = [state.split() for state in state_names] state_names = list(itertools.chain.from_iterable(state_names)) state_capitals = bl.get_state_capitals() state_capitals = [capital.split() for capital in state_capitals] state_capitals = list(itertools.chain.from_iterable(state_capitals)) months = get_months() letters = get_letters() law_words = get_law_words() law_words = [stemmer.stem(word) for word in law_words] termos_interesse = get_termos_interesse() portuguese_names = get_portuguese_names() def clean_text( raw_text ): # Function to convert a raw text to a string of words # The input is a single string (a raw text), and # the output is a single string (a preprocessed text) # # 1. Includes a space before "<" to avoid joining two words together pre_text = raw_text.replace("<", " <") # # 2. Some states (such as Acre) uses "_" to separate a line pre_text = pre_text.replace("_", " ") # # 3. Remove HTML review_text = BeautifulSoup(pre_text).get_text()