def _bow_svm(train_bow_file_name, train_label_file_name, test_bow_file_name,
             test_label_file_name, pred_file_name='e:/dc/20ng_data/tmp_labels.txt'):
    """Train ``svm.LinearSVC`` on bag-of-words vectors and print test metrics.

    The train and test bag-of-words files are loaded with
    ``ioutils.load_bow_file`` (second argument ``False``), per-word values are
    computed from the *training* documents only via ``_get_idf_values``, and
    both sets are converted with ``_word_cnts_to_bow_vecs`` using those values.
    After fitting, each predicted label is written next to the true label
    (tab separated, one pair per line) to ``pred_file_name``, and accuracy plus
    macro-averaged precision, recall and F1 are printed.

    Args:
        train_bow_file_name: path of the training bag-of-words file.
        train_label_file_name: path of the training labels file.
        test_bow_file_name: path of the test bag-of-words file.
        test_label_file_name: path of the test labels file.
        pred_file_name: where the "predicted<TAB>true" pairs are written.
            Defaults to the location this function always used.
    """
    print('loading file ...')
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(
        train_bow_file_name, False)
    # NOTE(review): num_words is overwritten by the value read from the test
    # file; presumably both files share one vocabulary -- confirm.
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(
        test_bow_file_name, False)
    print('%s words' % (num_words,))

    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list, num_words)
    print(idfs)

    print('to sparse ...')
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list, train_word_cnts_list,
                                      num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list, test_word_cnts_list,
                                     num_words, idfs)

    train_y = ioutils.load_labels_file(train_label_file_name)
    test_y = ioutils.load_labels_file(test_label_file_name)

    print('training svm ...')
    clf = svm.LinearSVC()
    clf.fit(train_cm, train_y)
    print('done.')

    y_pred = clf.predict(test_cm)

    # The context manager closes the file even if a write fails part-way.
    with open(pred_file_name, 'wb') as ftmp:
        for pred_label, true_label in zip(y_pred, test_y):
            ftmp.write(str(pred_label) + '\t' + str(true_label) + '\n')

    print('accuracy %s' % (accuracy_score(test_y, y_pred),))
    print('precision %s' % (precision_score(test_y, y_pred, average='macro'),))
    print('recall %s' % (recall_score(test_y, y_pred, average='macro'),))
    print('f1 %s' % (f1_score(test_y, y_pred, average='macro'),))
def __bow_svm(train_bow_file_name, train_label_file_name, test_bow_file_name,
              test_label_file_name):
    """Fit a linear-kernel one-vs-one ``svm.SVC`` on bag-of-words data.

    Both bag-of-words files are read through ``ioutils.load_bow_file`` with
    ``True`` as the second argument. The per-word values returned by
    ``_get_idf_values`` come from the training documents only and are reused
    for the test documents. Label distributions, the first 100 predictions and
    the evaluation metrics are printed along the way.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` where precision, recall
        and F1 are macro-averaged.
    """
    print('loading file ...')
    # Second positional argument of load_bow_file; presumably selects 16-bit
    # counts -- confirm against ioutils.
    uint16_flag = True
    tr_indices, tr_cnts, vocab_size = ioutils.load_bow_file(train_bow_file_name, uint16_flag)
    te_indices, te_cnts, vocab_size = ioutils.load_bow_file(test_bow_file_name, uint16_flag)
    print('%s words' % (vocab_size,))

    idf_values = _get_idf_values(tr_indices, tr_cnts, vocab_size)
    print(idf_values)

    y_train = ioutils.load_labels_file(train_label_file_name)
    print(Counter(y_train))
    y_test = ioutils.load_labels_file(test_label_file_name)
    print(Counter(y_test))

    print('to sparse ...')
    x_train = _word_cnts_to_bow_vecs(tr_indices, tr_cnts, vocab_size, idf_values)
    x_test = _word_cnts_to_bow_vecs(te_indices, te_cnts, vocab_size, idf_values)

    print('training svm ...')
    # Alternatives left commented out in the original: a degree-2 polynomial
    # kernel and svm.LinearSVC.
    classifier = svm.SVC(decision_function_shape='ovo', kernel='linear')
    classifier.fit(x_train, y_train)
    print('done.')

    predicted = classifier.predict(x_test)
    print(predicted[:100])
    print(Counter(predicted))

    # Compute every metric first, then report them in a fixed order.
    metrics = (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted, average='macro'),
        recall_score(y_test, predicted, average='macro'),
        f1_score(y_test, predicted, average='macro'),
    )
    for caption, value in zip(('accuracy', 'precision', 'recall', 'macro f1'), metrics):
        print('%s %s' % (caption, value))
    print('%f\t%f\t%f\t%f' % metrics)
    return metrics
def __bow_lr(train_bow_file_name, train_label_file_name, test_bow_file_name,
             test_label_file_name):
    """Train multinomial logistic regression on bag-of-words vectors.

    Mirrors ``__bow_svm``: both bag-of-words files are loaded with
    ``ioutils.load_bow_file`` (second argument ``True``), per-word values are
    computed from the training documents only via ``_get_idf_values``, and the
    vectors produced by ``_word_cnts_to_bow_vecs`` are fed to
    ``LogisticRegression(C=1000, multi_class='multinomial',
    solver='newton-cg')``. Label distributions and metrics are printed.

    Args:
        train_bow_file_name: path of the training bag-of-words file.
        train_label_file_name: path of the training labels file.
        test_bow_file_name: path of the test bag-of-words file.
        test_label_file_name: path of the test labels file.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` with macro-averaged
        precision, recall and F1 -- the same shape ``__bow_svm`` returns.
        (Previously the metrics were only printed and ``None`` was returned.)
    """
    print('loading file ...')
    uint16_cnts = True
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(
        train_bow_file_name, uint16_cnts)
    # NOTE(review): num_words is overwritten by the value read from the test
    # file; presumably both files share one vocabulary -- confirm.
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(
        test_bow_file_name, uint16_cnts)
    print('%s words' % (num_words,))

    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list, num_words)
    print(idfs)

    train_y = ioutils.load_labels_file(train_label_file_name)
    print(Counter(train_y))
    test_y = ioutils.load_labels_file(test_label_file_name)
    print(Counter(test_y))

    print('to sparse ...')
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list, train_word_cnts_list,
                                      num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list, test_word_cnts_list,
                                     num_words, idfs)

    print('training lr ...')
    lr = LogisticRegression(C=1000, multi_class='multinomial', solver='newton-cg')
    lr.fit(train_cm, train_y)
    print('done.')

    y_pred = lr.predict(test_cm)
    print(Counter(y_pred))

    acc = accuracy_score(test_y, y_pred)
    prec = precision_score(test_y, y_pred, average='macro')
    recall = recall_score(test_y, y_pred, average='macro')
    f1 = f1_score(test_y, y_pred, average='macro')

    print('accuracy %s' % (acc,))
    print('precision %s' % (prec,))
    print('recall %s' % (recall,))
    print('macro f1 %s' % (f1,))
    print('%f\t%f\t%f\t%f' % (acc, prec, recall, f1))
    # Return the metrics so callers can aggregate them, as __bow_svm does.
    return acc, prec, recall, f1
def __get_bow_vecs(dw_file):
    """Build bag-of-words vectors for the documents stored in ``dw_file``.

    The file is read with ``ioutils.load_bow_file``; per-word values are
    derived from those same documents by ``_get_idf_values`` (presumably
    inverse document frequencies) and handed to ``_word_cnts_to_bow_vecs``,
    whose result is returned unchanged.

    NOTE(review): an identical ``__get_bow_vecs`` is defined again directly
    after this one, so the later definition is the one that stays bound.
    """
    print('loading file ...')
    indices_per_doc, cnts_per_doc, vocab_size = ioutils.load_bow_file(dw_file)
    print('%s words' % (vocab_size,))

    idf_values = _get_idf_values(indices_per_doc, cnts_per_doc, vocab_size)

    print('to sparse ...')
    return _word_cnts_to_bow_vecs(indices_per_doc, cnts_per_doc, vocab_size, idf_values)
def __get_bow_vecs(dw_file):
    """Return the bag-of-words vectors for every document in ``dw_file``.

    Steps, each announced with a progress message:
      1. load word indices, word counts and the word total from ``dw_file``
         via ``ioutils.load_bow_file``;
      2. compute per-word values from these documents with
         ``_get_idf_values`` (presumably IDF weights -- confirm in the helper);
      3. convert everything with ``_word_cnts_to_bow_vecs`` and return the
         result as-is.
    """
    print('loading file ...')
    doc_word_indices, doc_word_cnts, n_words = ioutils.load_bow_file(dw_file)
    print('%s words' % (n_words,))

    word_weights = _get_idf_values(doc_word_indices, doc_word_cnts, n_words)

    print('to sparse ...')
    vectors = _word_cnts_to_bow_vecs(doc_word_indices, doc_word_cnts, n_words, word_weights)
    return vectors
def close_words_of_docs():
    """Print the first document's words ranked by dot product with its vector.

    Loads the word list, the bag-of-words documents and the document/word
    vector files from fixed paths under ``e:/dc/20ng_bydate``. For every word
    index of the first document, the dot product between the document vector
    and that word's vector is computed; the words are then printed in
    ascending order of that value, so the highest-scoring words come last.

    NOTE(review): this function is defined a second time directly after this
    definition; the later one is the one that stays bound.
    """
    words = ioutils.load_words_dict_to_list('e:/dc/20ng_bydate/words_dict.txt')

    word_indices_list, _, num_words = ioutils.load_bow_file(
        'e:/dc/20ng_bydate/all_docs_dw_net.bin')
    print('%s words' % (num_words,))

    doc_vecs = ioutils.load_vec_list_file('e:/dc/20ng_bydate/vecs/doc_vec_cpp_100.bin')
    word_vecs = ioutils.load_vec_list_file('e:/dc/20ng_bydate/vecs/word_vecs_cpp_100.bin')

    def show_close_words(doc_vec, word_indices):
        # Pair each word index with its dot-product score, lowest score first.
        scored = sorted(
            ((np.dot(doc_vec, word_vecs[word_idx]), word_idx) for word_idx in word_indices),
            key=lambda pair: pair[0])
        for score, word_idx in scored:
            print('%s %s' % (score, words[word_idx]))

    show_close_words(doc_vecs[0], word_indices_list[0])
def close_words_of_docs():
    """Show how each word of the first document scores against that document.

    Reads the vocabulary, the bag-of-words documents and the document and word
    vectors from hard-coded files under ``e:/dc/20ng_bydate``. The score of a
    word is ``np.dot(document_vector, word_vector)``; one "score word" line is
    printed per word index of the first document, sorted by ascending score.
    """
    vocab_path = 'e:/dc/20ng_bydate/words_dict.txt'
    vocab = ioutils.load_words_dict_to_list(vocab_path)

    bow_path = 'e:/dc/20ng_bydate/all_docs_dw_net.bin'
    indices_per_doc, cnts_per_doc, vocab_size = ioutils.load_bow_file(bow_path)
    print('%s words' % (vocab_size,))

    doc_vec_path = 'e:/dc/20ng_bydate/vecs/doc_vec_cpp_100.bin'
    word_vec_path = 'e:/dc/20ng_bydate/vecs/word_vecs_cpp_100.bin'
    all_doc_vecs = ioutils.load_vec_list_file(doc_vec_path)
    all_word_vecs = ioutils.load_vec_list_file(word_vec_path)

    # Only the first document is inspected.
    first_doc_vec = all_doc_vecs[0]
    first_doc_words = indices_per_doc[0]

    scored_words = [(np.dot(first_doc_vec, all_word_vecs[idx]), idx)
                    for idx in first_doc_words]
    # Ascending by score: the words most aligned with the document end up last.
    scored_words.sort(key=lambda entry: entry[0])

    for score, idx in scored_words:
        print('%s %s' % (score, vocab[idx]))