import numpy as np
import utils
import joblib


def load_predict(directory):
    # Load the trained SVM pipeline and predict a category for every line
    # found under the given directory/directories.
    clf = joblib.load('svm_trained.joblib')
    documents = utils.load_dirs_custom(directory)
    documents = utils.n_gram_documents_range(documents, 5, 6)
    documents = np.array(documents)
    doc_test = utils.convert_docs_to_lines(documents)

    predicted_lines = []
    for doc in doc_test:
        lines = clf.predict(doc.data)
        predicted_lines += list(lines)

    return predicted_lines
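# A minimal usage sketch for load_predict. It assumes the trained pipeline
# has already been saved as 'svm_trained.joblib' (see the training script
# below) and that a list of directories is accepted, as with
# utils.load_dirs_custom elsewhere in this repo; both are assumptions.
if __name__ == '__main__':
    predictions = load_predict(['./NON_PERSONAL_DATA'])
    for i, label in enumerate(predictions[:10]):
        print('line %d -> category %s' % (i, label))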
import numpy as np
import pickle
import utils
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.regularizers import l1
from keras.utils import np_utils

# Category ids come from utils.personal_categories_dict (and its inverse, .inv).
documents = utils.load_dirs_custom([
    '../../TAGGED_DATA_NEW_NEW/SENSITIVE_DATA/html-tagged',
    '../../TAGGED_DATA_NEW_NEW/PERSONAL_DATA/html-tagged',
    '../../TAGGED_DATA_NEW_NEW/NON_PERSONAL_DATA'
], individual=True)

# Build one training example per document: the full document text as input,
# and the set of categories that appear on any of its lines as the target.
x = []
y = []
for document in documents:
    lines = document.lines
    categories = []
    for line in lines:
        for category in line.categories:
            if category not in categories:
                categories.append(category)
    x += ['\n'.join(document.data)]
    y += [categories]
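# Before these document-level examples can feed a Keras model, the
# variable-length category lists in y need to become a fixed-width binary
# indicator matrix. A minimal sketch using scikit-learn's
# MultiLabelBinarizer; this vectorisation step is an assumption, not part
# of the original file.
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_binary = mlb.fit_transform(y)  # shape: (n_documents, n_categories)
print('label matrix:', y_binary.shape, 'categories:', mlb.classes_)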
import numpy as np
import utils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import joblib  # sklearn.externals.joblib is deprecated in recent scikit-learn

documents = utils.load_dirs_custom([
    './SENSITIVE_DATA/html-tagged',
    './PERSONAL_DATA/html-tagged',
    './NON_PERSONAL_DATA'
])

documents = utils.n_gram_documents_range(documents, 5, 6)
documents = np.array(documents)
doc_train, doc_test = utils.document_test_train_split(documents, 0.01)

print("Doc train: ", len(doc_train))
print("Doc test: ", len(doc_test))

X_train, y_train = utils.convert_docs_to_lines(doc_train)
X_test, y_test = utils.convert_docs_to_lines(doc_test)

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    # The SGDClassifier arguments after loss='hinge' are truncated in the
    # original snippet; loss='hinge' alone gives a linear SVM.
    ('clf', SGDClassifier(loss='hinge')),
])
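# Sketch of the training and persistence step that would normally follow:
# fit the pipeline on the line-level data, report held-out metrics, and save
# the model under the filename load_predict expects. The classification_report
# call is an assumption, chosen to match the metrics imports above.
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))
joblib.dump(text_clf, 'svm_trained.joblib')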
import matplotlib.pyplot as plt
import numpy as np
import utils


# The function header is missing from the original snippet; judging from the
# body this is a min-max normalisation, so a plausible signature is assumed.
def normalize(arr, minimum, maximum):
    new_arr = []
    for value in arr:
        new_arr.append((value - minimum) / (maximum - minimum))
    return new_arr


def show_overfit_plot():
    # Plot training vs. validation loss from a (global) Keras History object.
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()


documents = utils.load_dirs_custom([
    '../../anondata_lines/sensitive',
    '../../anondata_lines/personal',
    '../../anondata_lines/nonpersonal'
], individual=True)

documents = utils.n_gram_documents_range(documents, 8, 8)
doc_train, doc_test = utils.document_test_train_split(documents, 0.05)

print("Doc train: ", len(doc_train))
print("Doc test: ", len(doc_test))

x_train, y_train = utils.convert_docs_to_lines(doc_train)
x_test, y_test = utils.convert_docs_to_lines(doc_test)

# Collapse the three-way labels to binary: sensitive (2) and personal (1)
# both map to 1, non-personal stays 0.
y_train = np.where((y_train == 2) | (y_train == 1), 1, 0)
y_test = np.where((y_test == 2) | (y_test == 1), 1, 0)
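# show_overfit_plot reads a global `history` object that this snippet never
# defines. A minimal sketch of where it would come from, assuming the lines
# are first vectorised (TfidfVectorizer here is an assumption) and a small
# dense network is trained with a validation split to produce val_loss:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
x_train_vec = vectorizer.fit_transform(x_train).toarray()

model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train_vec.shape[1],)),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(x_train_vec, y_train, validation_split=0.1, epochs=10)
show_overfit_plot()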