def run_classifiers_cv(file_names, ptest=0.2, cv=3, max_sentences=4,
                       as_sentences=False, labels=['ACCOUNT']):
    data = load_xlsx_data(file_names, max_sentences=max_sentences,
                          as_sentences=as_sentences, labels=labels)
    vectorizers_dict = gen_tfidf_vectorizers_dict()
    classifiers_dict = gen_classifiers_dict()
    cv_results_df = pd.DataFrame(columns=[
        'cv', 'classifier', 'vectorizer', 'precision', 'recall', 'fscore',
        'label', 'num_train_exs', 'num_test_exs'
    ])
    random.seed(a=3)
    for c in range(cv):
        print("*****************START CV=" + str(c) + "************************")
        start_time = time.time()
        rand_int = random.randint(0, 1000)
        # fresh random train/test split for each cross-validation round
        data_train, data_test = train_test_split(
            pd.DataFrame(data), test_size=ptest, random_state=rand_int)
        for label in labels:
            if sum(data_train[label]) < 2 or sum(data_test[label]) < 2:
                print("less than 2 examples of label: " + label)
            elif any(data_train[label] > 1):
                # label columns are expected to be binary (0/1); larger values
                # indicate an invalid annotation
                print("given labels invalid: " + label)
            # elif not any(data_train[label] == 1):
            #     print("No labels of class 1 for label: " + label)
            #     print(data_train[label])
            else:
                print("num train exs: " + str(sum(data_train[label])))
                results = run_classifiers(data_train, data_test,
                                          vectorizers_dict, classifiers_dict,
                                          label=label)
                results['cv'] = pd.Series([c] * results.shape[0])
                results['num_train_exs'] = pd.Series(
                    [sum(data_train[label])] * results.shape[0])
                results['num_test_exs'] = pd.Series(
                    [sum(data_test[label])] * results.shape[0])
                # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent
                cv_results_df = pd.concat([cv_results_df, results],
                                          ignore_index=True)
                cv_results_df.to_csv(
                    results_path + "cv_" + str(c) + str(label) +
                    "_label_fulldata_excerpts_tfidf_cv_results_labels.csv")
        end_time = time.time()
        print("Time for cv=" + str(c) + " : " + str(end_time - start_time))
        cv_results_df.to_csv(results_path + "cv_" + str(c) +
                             "_fulldata_excerpts_tfidf_cv_results_labels.csv")
    return cv_results_df
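
# Usage sketch (illustrative, not part of the pipeline): assumes `results_path` points
# at a writable directory and that the named spreadsheet sits next to this module.
#
#   cv_results = run_classifiers_cv(
#       ["Isla Vista - All Excerpts - 1_2_2019.xlsx"],
#       ptest=0.2, cv=3, labels=['ACCOUNT'])
#   print(cv_results[['cv', 'classifier', 'vectorizer', 'fscore']].head())
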
def produce_visualization(
        file_names=["Isla Vista - All Excerpts - 1_2_2019.xlsx"],
        tokenizer=stem_tokenizer,
        labels=['ACCOUNT', 'HERO'],
        max_sentences=None,
        as_sentences=False,
        output_file='ldavis'):
    data = load_data.load_xlsx_data(file_names, max_sentences=max_sentences,
                                    as_sentences=as_sentences, labels=labels)
    excerpts = list(data['Excerpts'])

    # exclude labels with no true example in these files
    keep_labels = []
    for lab in labels:
        if sum(data[lab]) > 0:
            keep_labels.append(lab)
        else:
            print(lab + " label not present in files: " + str(file_names))
    labels = keep_labels

    # subset of the data frame holding only the label columns
    main_types_df = data[labels]
    main_types_df.index = range(1, main_types_df.shape[0] + 1)

    # drop rows and excerpts with no label; build vocab and doc_lengths
    all_words = []
    doc_lengths = []
    main_types_excerpts = []
    for idx, doc in enumerate(excerpts):
        if sum(main_types_df.loc[idx + 1]) < 1:
            # this document has no main-type label
            main_types_df = main_types_df.drop([idx + 1], axis=0)
        else:
            main_types_excerpts.append(doc)
            doc_toks = tokenizer(doc)
            all_words.extend(doc_toks)
            doc_lengths.append(len(doc_toks))
    fdist = FreqDist(all_words)
    fdistmc = fdist.most_common()
    vocab = [word for word, count in fdistmc]
    term_frequency = [count for word, count in fdistmc]
    print("number of labelled documents: " + str(len(doc_lengths)))

    # build the topic-term distribution (one "topic" per label column)
    stop_words = set(stopwords.words('english'))
    freq_dist_dict = {}
    topic_size = []
    topic_num_words = []
    i = 0
    for coln in main_types_df.columns:
        categ_excerpts = list(
            compress(main_types_excerpts, main_types_df[coln].values))
        exq = [tokenizer(doc) for doc in categ_excerpts]
        excerpt_words = [tok for tok_list in exq for tok in tok_list]
        i = i + 1
        topic_size.append(len(exq))
        topic_num_words.append(len(excerpt_words))
        #print("Topic "+str(i)+": "+coln+" number of excerpts: "+str(len(exq)))
        words = [
            word for word in excerpt_words
            if word.lower() not in stop_words and word.isalpha()
        ]
        freq_dist_dict[coln] = FreqDist(words)

    topic_term_dists = []
    for coln in main_types_df.columns:
        ffdist = freq_dist_dict[coln]
        # use the smallest positive float for absent words so no probability is exactly zero
        fdist = [
            ffdist.freq(word) if word in ffdist else np.nextafter(0.0, 1.0)
            for word in vocab
        ]
        #print("categ: "+str(coln)+" len of freq dist "+str(len(fdist))+" sum of vector: "+str(sum(fdist)))
        topic_term_dists.append([float(i) for i in fdist])

    # document-topic distribution: normalize each label row to sum to 1
    doc_topic_dists = []
    for index, rowi in main_types_df.iterrows():
        row = list(rowi)
        if sum(row) > 1.01 or sum(row) < 0.99:
            #print(str(index)+" row: "+str(row))
            row = [r / sum(row) for r in row]
        if sum(row) == 0:
            print(row)
        doc_topic_dists.append([float(i) for i in row])

    # format for pyLDAvis
    data_dict = {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_frequency
    }
    #print('Topic-Term shape: %s' % str(np.array(data_dict['topic_term_dists']).shape))
    #print('Doc-Topic shape: %s' % str(np.array(data_dict['doc_topic_dists']).shape))

    # save the prepared data as json
    with open(output_file + '.json', 'w') as json_file:
        json.dump(data_dict, json_file)

    vis_data = pyLDAvis.prepare(**data_dict, n_jobs=-1)

    # report the order pyLDAvis assigned to the topics
    col_order = vis_data.topic_order
    categs = list(main_types_df.columns)
    string_list = [""] * len(col_order)
    for idx, i in enumerate(col_order):
        msg = ("Topic " + str(idx + 1) + ": " + categs[i - 1] +
               ", number of words: " + str(topic_num_words[i - 1]))
        print(msg)
        string_list[idx] = msg
    with open(output_file + '.txt', 'w') as f:
        for msg in string_list:
            f.write("%s\n" % msg)

    pyLDAvis.save_html(vis_data, output_file + '.html')
    # if display:
    #     pyLDAvis.display(vis_data)
    return vis_data
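
# Minimal driver sketch, assuming the default Isla Vista spreadsheet is in the working
# directory and nltk's 'stopwords' corpus has been downloaded; the output file name
# below is illustrative.
if __name__ == "__main__":
    produce_visualization(
        file_names=["Isla Vista - All Excerpts - 1_2_2019.xlsx"],
        labels=['ACCOUNT', 'HERO'],
        output_file='ldavis_account_hero')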