import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

# regexp matching the default token pattern used by CountVectorizer
regexp = re.compile('(?u)\\b\\w\\w+\\b')

# load the spaCy English model and save the original tokenizer
en_nlp = spacy.load('en')
old_tokenizer = en_nlp.tokenizer
# replace the tokenizer with the regexp above
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(
    regexp.findall(string))

def custom_tokenizer(document):
    # custom tokenizer: run the spaCy pipeline (skipping entity
    # recognition and parsing) and return the lemma of each token
    doc_spacy = en_nlp(document, entity=False, parse=False)
    return [token.lemma_ for token in doc_spacy]

# CountVectorizer using the lemmatizing tokenizer
lemma_vect = CountVectorizer(tokenizer=custom_tokenizer, min_df=5)
X_train_lemma = lemma_vect.fit_transform(text_train)
print("X_train_lemma.shape: {}".format(X_train_lemma.shape))

# standard CountVectorizer for comparison
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)
print("X_train.shape: {}".format(X_train.shape))

# grid search: use only 1% of the data as the training fold to keep
# the search fast
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10]}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.99,
                            train_size=0.01, random_state=0)
grid = GridSearchCV(LogisticRegression(), param_grid, cv=cv)
grid.fit(X_train, y_train)
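# A minimal sketch (not part of the original script) of how the two
# feature sets could be compared: `best_score_` and `best_params_` are
# standard GridSearchCV attributes; repeating the same search on
# X_train_lemma (built above) gives a like-for-like comparison.
print("Best cross-validation score (standard): {:.3f}".format(
    grid.best_score_))
print("Best parameters:", grid.best_params_)

grid_lemma = GridSearchCV(LogisticRegression(), param_grid, cv=cv)
grid_lemma.fit(X_train_lemma, y_train)
print("Best cross-validation score (lemmatized): {:.3f}".format(
    grid_lemma.best_score_))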
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import precision_recall_curve

'''
pipe = Pipeline([('vect', CountVectorizer()),
                 ('clf', MultinomialNB())])
'''

param_grid = {
    'vect__min_df': [1, 2, 3, 4, 5],
    'clf__alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
}

vectorizer = CountVectorizer(min_df=5)
Xtrain = vectorizer.fit_transform(X_train)
# the test set must be transformed with the vocabulary learned on the
# training set, not refit
Xtest = vectorizer.transform(X_test)

# write the results of each parameter combination to a timestamped CSV
import datetime
now = datetime.datetime.now().strftime("%Y%m%d%H%M")
filename = './model_save/naive_bayes_' + now

import csv
f = open(filename + ".csv", "w")
csvWrite = csv.writer(f)
csvWrite.writerow(["min_df", "alpha", "score", "recall", "precision"])

count = 1
for alpha in param_grid['clf__alpha']: