def test_vectorize_text():
    """Exercise ``process_text.vectorize_text`` under three option sets.

    Scenarios covered, all on the ``"text"`` column of ``text_df``:
    every optional step disabled, stop-word removal only, and
    lemmatisation only.
    """
    # Baseline: every optional processing step is switched off.
    plain = process_text.vectorize_text(
        text_df,
        "text",
        remove_stopwords=False,
        tfidf=False,
        lemma=False,
        lsa=False,
    )
    plain_columns = list(plain.columns)
    assert len(plain_columns) == 12

    # Stop-word removal only: the token "so" must not survive as a column.
    without_stopwords = process_text.vectorize_text(
        text_df,
        "text",
        remove_stopwords=True,
        tfidf=False,
        lemma=False,
        lsa=False,
    )
    stopword_free_columns = list(without_stopwords.columns)
    assert "so" not in stopword_free_columns

    # Lemmatisation only: column count is unchanged and "be" is a column.
    lemmatized = process_text.vectorize_text(
        text_df,
        "text",
        remove_stopwords=False,
        tfidf=False,
        lemma=True,
        lsa=False,
    )
    lemma_columns = list(lemmatized.columns)
    assert len(lemma_columns) == 12
    assert "be" in lemma_columns
# Input location and the columns this script works with.
TEXT_FILE_PATH = start.CLEAN_DATA_PATH + "text.csv"
ID = "id_attempt"
COLUMNS = ["text_clean"]

# %%
# Load the cleaned texts, index them by attempt id and keep only the
# columns listed in COLUMNS.
df = pd.read_csv(TEXT_FILE_PATH).set_index(ID)
df = df[COLUMNS]

# %%
# %% Version 1
# Vectorize with stop words removed; tf-idf, lemmatisation and LSA are off.
matrix = process_text.vectorize_text(
    df,
    text_col="text_clean",
    remove_stopwords=True,
    tfidf=False,
    lemma=False,
    lsa=False,
)

file = start.RESULTS_PATH + "Pilot Study/Cosine Replicability 1.xlsx"
wb = load_workbook(file)
ws = wb.active

# Fill the sheet with the pairwise cosine-similarity matrix. Both axes start
# at 2, leaving row 1 and column 1 untouched: each `main` text gets its own
# column and each `comp` text its own row. The counters come from enumerate
# so that every pair lands in its own cell instead of all values
# overwriting cell (2, 2).
# NOTE(review): the workbook is not saved in this part of the script -
# confirm that wb.save(...) is called further down.
for col, main in enumerate(matrix.index, start=2):
    main_vector = matrix.loc[main]  # invariant for the whole inner loop
    for row, comp in enumerate(matrix.index, start=2):
        # scipy returns the cosine *distance*; 1 - distance is the similarity.
        dist = 1 - scipy.spatial.distance.cosine(main_vector, matrix.loc[comp])
        ws.cell(row=row, column=col).value = round(dist, 2)
) # Print the Keyword in the 10 topics print(lda_model.print_topics()) doc_lda = lda_model[corpus] # %% # Visualize the topics pyLDAvis.enable_notebook() vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word) vis # %% CLUSTER df = survey1[["text"]] matrix = process_text.vectorize_text(df, text_col="text", remove_stopwords=True, tfidf=True) num_clusters = 3 km = KMeans(n_clusters=num_clusters) km.fit(matrix) clusters = km.labels_.tolist() df["cluster"] = clusters grouped = df["text"].groupby(df["cluster"]) # %% print("Top terms per cluster:") print() # sort cluster centers by proximity to centroid order_centroids = km.cluster_centers_.argsort()[:, ::-1]