def relearn(self):
    """Rebuild the whole pipeline from the raw text data on disk.

    Reads documents from ``self.txt_data_path``, fits a TF-IDF vector
    space model, optionally reduces dimensionality with TruncatedSVD,
    fits a one-vs-rest classifier, and serializes every fitted artifact
    (vectorizer, svd, clf) under ``self.clf_data_path``.  Prints a 5-fold
    cross-validation accuracy at the end.  Any failure is logged via
    ``Debug.print_exception_info()`` instead of propagating.
    """
    try:
        print('getting data ... ')
        data, k = get_data_dict(self.txt_data_path, print_all=False)

        print('creating the model ...')
        tfidf_matrix, labels, self.vectorizer = \
            get_vector_space_model(data, min_doc_list_size=0,
                                   make_equal_size=False)

        print('saving vectorizer to disk ...')
        ds.serialize(self.vectorizer, self.clf_data_path + 'vectorizer')

        vector_dim = tfidf_matrix.shape[1]
        if self.svd_dim < vector_dim:
            # TruncatedSVD requires n_components < n_features.
            print('reducing dimension')
            self.svd = TruncatedSVD(n_components=self.svd_dim)
            lsi_matrix = self.svd.fit_transform(tfidf_matrix)
            print('saving svd transformer to disk ...')
            ds.serialize(self.svd, self.clf_data_path + 'svd')
        else:
            # BUG FIX: previously lsi_matrix stayed undefined on this
            # path, so clf.fit raised NameError which the except below
            # silently swallowed.  Train on the raw TF-IDF matrix instead.
            # NOTE(review): on this path no 'svd' artifact is serialized —
            # confirm the loading side tolerates a missing svd file.
            lsi_matrix = tfidf_matrix

        clf1 = get_classifier(self.clf_name, clf_params=(10, ))
        self.clf = OneVsRestClassifier(clf1)
        print('\nfitting classifier ...')
        self.clf.fit(lsi_matrix, labels)

        print('saving classifier to disk ...')
        ds.serialize(self.clf, self.clf_data_path + 'clf')

        scores = cross_val_score(self.clf, lsi_matrix, labels, cv=5)
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(),
                                               scores.std() * 2))
        print('relearning done')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; failures are logged, not raised.
        print('\nrelearning failed\n')
        Debug.print_exception_info()
def get_vector_space_model_train_test(data_dict, min_doc_list_size=0,
                                      make_equal_size=False, train_perc=0.66):
    """Build a train/test split TF-IDF vector space model.

    Parameters
    ----------
    data_dict : dict
        ``data_dict[class_name]['doc_list']`` is a list of text documents
        (the shape produced by ``get_data_dict``).
    min_doc_list_size : int
        When > 0, classes with fewer documents are skipped entirely.
    make_equal_size : bool
        When True (and ``min_doc_list_size`` > 0), exactly
        ``min_doc_list_size`` documents are randomly sampled per class.
    train_perc : float
        Fraction of documents used for training.

    Returns
    -------
    tuple
        (matrix_train, matrix_test, Y_train, Y_test, vectorizer)
    """
    print('\ncreating models ...\n')
    doc_list = []
    labels = []
    print('making doc list')
    for class_name in data_dict:
        try:
            dl = data_dict[class_name]['doc_list']
            # Chained comparison: skip only when a positive minimum is set
            # AND this class has fewer documents than that minimum.
            if len(dl) < min_doc_list_size > 0:
                continue
            if make_equal_size is False:
                doc_list += dl
                labels += len(dl) * [class_name]
            elif min_doc_list_size > 0:
                # Equal-size mode: down-sample every class to the minimum.
                doc_list += random.sample(dl, min_doc_list_size)
                labels += min_doc_list_size * [class_name]
            # NOTE(review): when make_equal_size is True and
            # min_doc_list_size == 0, the class is silently dropped —
            # confirm this is intended.
        except Exception:
            # FIX: narrowed from a bare `except:` so KeyboardInterrupt
            # and SystemExit are not swallowed mid-loop.
            Debug.print_exception_info()

    print('making matrix')
    doc_list_train, doc_list_test, Y_train, Y_test = train_test_split(
        doc_list, labels, test_size=1 - train_perc,
        random_state=random.randint(0, 31))

    vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize)
    # Fit on the training documents only; the test set is merely
    # transformed so no test information leaks into the vocabulary/IDF.
    matrix_train = vectorizer.fit_transform(doc_list_train)
    matrix_test = vectorizer.transform(doc_list_test)
    print('matrix ready')
    return (matrix_train, matrix_test, Y_train, Y_test, vectorizer)
def get_data_dict(data_path, print_all=False):
    """Read a directory tree of text documents into a dictionary.

    ``data_path`` must end with a path separator; each subdirectory name
    is treated as a class label and every regular file inside it (except
    ``.DS_Store``) is read as one document.

    Parameters
    ----------
    data_path : str
        Root directory, one subdirectory per class (trailing '/' required
        because paths are built by string concatenation).
    print_all : bool
        When True, print progress for every file.

    Returns
    -------
    tuple
        (data, not_readed_files_counter) where
        ``data[class_name] == {'doc_list': [...], 'not_readed': int}``.
    """
    data = dict()
    class_names = os.listdir(data_path)
    not_readed_files_counter = 0
    for class_name in class_names:
        data[class_name] = {'doc_list': [], 'not_readed': 0}
        files = os.listdir(data_path + class_name)
        for i, fname in enumerate(files, start=1):
            if print_all:
                print(class_name, i, ' from ', len(files))
            if '.DS_Store' == fname:
                continue
            try:
                # FIX: context manager — the original leaked one open file
                # handle per document.  Encoding is left at the platform
                # default to preserve the original behavior
                # (the commented-out encoding='utf-8' hints otherwise —
                # TODO confirm which encoding the corpus uses).
                with open(data_path + class_name + '/' + fname, 'r') as f:
                    txt = str(f.read())
                data[class_name]['doc_list'].append(txt)
            except Exception:
                # Narrowed from a bare `except:`; unreadable files are
                # counted and logged but do not abort the scan.
                print('-------------------------------------------')
                print(class_name)
                print(fname)
                data[class_name]['not_readed'] += 1
                not_readed_files_counter += 1
                Debug.print_exception_info()
    return data, not_readed_files_counter
data_path = clf_data_path # GET DATA if not serialized_data: data, not_readed_files_counter = get_data_dict(txt_data_path) ds.serialize((data, not_readed_files_counter), data_path + 'data') else: try: data, not_readed_files_counter = ds.deserialize(data_path + \ 'data') except: Debug.print_exception_info() # CREATE MODEL if not serialized_model: matrix_train, matrix_test, Y_train, Y_test, vect = \ \ get_vector_space_model_train_test(data, min_doc_list_size, make_equal_size, train_perc) ds.serialize(vect, data_path + 'vectorizer') ds.serialize((matrix_train, matrix_test, Y_train, Y_test),