def main(): print("Loading...") x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv", n=5000) x = [clean_html(text, strip=True) for text in x] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) print("Vectorizing") vectorizer = CountVectorizer(tokenizer=tokenize) x_train = vectorizer.fit_transform(x_train) x_test = vectorizer.transform(x_test) print(x_train.shape) print(x_test.shape) print("Selecting features...") selector = SelectKBest(k=7000, score_func=mutual_info_classif) selector.fit(x_train, y_train) x_train_new = selector.transform(x_train) x_test_new = selector.transform(x_test) print(x_train_new.shape) print(x_test_new.shape) print("Evaluating...") clf = LogisticRegression(solver="liblinear") clf.fit(x_train_new, y_train) y_pred = clf.predict(x_test_new) score = accuracy_score(y_test, y_pred) print("{:4f}".format(score))
def main():
    logger = logging.getLogger(__name__)
    x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv", n=5000)
    x = [clean_html(text, strip=True) for text in x]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=44)

    vectorizer = TfidfVectorizer(tokenizer=tokenize)
    x_train_vec = vectorizer.fit_transform(x_train)
    x_test_vec = vectorizer.transform(x_test)

    @stop_watch
    def k_fold_cv(clf, x_train, y_train, k_cv=5) -> None:
        cv_logger = logging.getLogger("kfoldcvLogging")
        scores = cross_val_score(clf, x_train, y_train, cv=k_cv)
        cv_logger.debug(f"CV scores (k={k_cv}): {scores}")
        cv_logger.debug("Accuracy: {:.4f} (+/- {:.4f})".format(scores.mean(), scores.std() * 2))
        return None

    clf = LogisticRegression(solver="liblinear")
    for k in [3, 4, 5]:
        k_fold_cv(clf=clf, x_train=x_train_vec, y_train=y_train, k_cv=k)

    clf.fit(x_train_vec, y_train)
    y_pred = clf.predict(x_test_vec)
    score = accuracy_score(y_test, y_pred)
    logger.debug("Accuracy score (test): {:.4f}".format(score))
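# `stop_watch` (used as a decorator above) is defined elsewhere in the repo; it is
# assumed to be a small timing decorator along the lines of the sketch below. The
# logger name and message format are illustrative only.
import functools
import logging
import time

def stop_watch(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - start
        logging.getLogger(__name__).debug(f"{func.__name__} took {elapsed:.2f} sec")
        return result
    return wrapper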
def main():
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv', n=5000)

    print('Tokenization')
    x = [clean_html(text, strip=True) for text in x]
    x = [' '.join(tokenize(text)) for text in x]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    print('Binary')
    vectorizer = CountVectorizer(binary=True)
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    print('Count')
    vectorizer = CountVectorizer(binary=False)
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    print('TF-IDF')
    vectorizer = TfidfVectorizer()
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    print('Bigram')
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)
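# `train_and_eval` is a helper used throughout these scripts and is defined
# elsewhere; a plausible sketch, assuming it mirrors the fit/predict pattern used
# in the other main() functions here: fit the given vectorizer on the training
# texts, train a liblinear LogisticRegression, and print the test accuracy.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def train_and_eval(x_train, y_train, x_test, y_test, vectorizer):
    x_train_vec = vectorizer.fit_transform(x_train)
    x_test_vec = vectorizer.transform(x_test)
    clf = LogisticRegression(solver="liblinear")
    clf.fit(x_train_vec, y_train)
    y_pred = clf.predict(x_test_vec)
    print("{:.4f}".format(accuracy_score(y_test, y_pred)))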
def main():
    logger = logging.getLogger(__name__)

    import MeCab
    path_neologd = '/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd'
    t_mecab = MeCab.Tagger("-Owakati -d {}".format(path_neologd))

    def tokenize_by_mecab(text):
        return list(t_mecab.parse(text).strip().split(" "))

    use_tokens = [tokenize, tokenize_by_mecab]
    t_names = ["janome", "MeCab"]

    logger.debug("Loading dataset...")
    x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv", n=5000)
    x = [clean_html(text, strip=True) for text in x]

    for use_token, t_name in zip(use_tokens, t_names):
        logger.debug("●○ {} ○●".format(t_name))
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=44)

        logger.debug("Count vectorizing...")
        vectorizer = CountVectorizer(tokenizer=use_token)
        x_train = vectorizer.fit_transform(x_train)
        x_test = vectorizer.transform(x_test)
        logger.debug(f"x_train's shape is {x_train.shape}")
        logger.debug(f" x_test's shape is {x_test.shape}")

        logger.debug("Selecting features...")
        # k: number of features to keep; mutual_info_classif: mutual information
        selector = SelectKBest(k=7000, score_func=mutual_info_classif)
        selector.fit(x_train, y_train)
        x_train_new = selector.transform(x_train)
        x_test_new = selector.transform(x_test)
        logger.debug(f"x_train_new's shape is {x_train_new.shape}")
        logger.debug(f" x_test_new's shape is {x_test_new.shape}")

        logger.debug("Evaluating...")
        clf = LogisticRegression(solver="liblinear")
        clf.fit(x_train_new, y_train)
        y_pred = clf.predict(x_test_new)
        score = accuracy_score(y_test, y_pred)
        logger.debug("{:.4f}".format(score))
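# The Janome-based `tokenize` compared against MeCab above is defined elsewhere in
# the repo; a minimal sketch, assuming it simply returns the wakati (surface-form)
# output of janome.tokenizer.Tokenizer:
from janome.tokenizer import Tokenizer

t_janome = Tokenizer(wakati=True)

def tokenize(text):
    return list(t_janome.tokenize(text))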
def main(): x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv", n=5000) x = [clean_html(text, strip=True) for text in x] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) vectorizer = TfidfVectorizer(tokenizer=tokenize) x_train_vec = vectorizer.fit_transform(x_train) x_test_vec = vectorizer.transform(x_test) title = "Learning Curves" cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0) clf = LogisticRegression(solver="liblinear") plot_learning_curve(clf, title, x_train_vec, y_train, cv=cv)
def main(): x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv", n=5000) print("Tokenization") x = [clean_html(text, strip=True) for text in x] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) vectorizer = TfidfVectorizer(tokenizer=tokenize) x_train_vec = vectorizer.fit_transform(x_train) x_test_vec = vectorizer.transform(x_test) clf = LogisticRegression(solver="liblinear") clf.fit(x_train_vec, y_train) y_pred = clf.predict(x_test_vec) score = accuracy_score(y_test, y_pred) print("{:.4f}".format(score))
def main():
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv', n=5000)
    x = [clean_html(text, strip=True) for text in x]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    vectorizer = TfidfVectorizer(tokenizer=tokenize)
    x_train_vec = vectorizer.fit_transform(x_train)
    x_test_vec = vectorizer.transform(x_test)

    clf = LogisticRegression(solver='liblinear')
    scores = cross_val_score(clf, x_train_vec, y_train, cv=5)
    print(scores)
    print('Accuracy(CV): {:.4f} (+/- {:.4f})'.format(scores.mean(), scores.std() * 2))

    clf.fit(x_train_vec, y_train)
    y_pred = clf.predict(x_test_vec)
    score = accuracy_score(y_test, y_pred)
    print('Accuracy(test): {:.4f}'.format(score))
def main(): x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv", n=5000) x = [clean_html(text, strip=True) for text in x] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) vectorizer = TfidfVectorizer(tokenizer=tokenize) x_train_vec = vectorizer.fit_transform(x_train) x_test_vec = vectorizer.transform(x_test) parameters = {"penalty": ["l1", "l2"], "C": [0.01, 0.03, 0.1, 0.3, 0.7, 1, 1.01, 1.03, 1.07, 1.1, 1.3, 1.7, 3]} lr = LogisticRegression(solver="liblinear") clf = GridSearchCV(lr, parameters, cv=5, n_jobs=-1) clf.fit(x_train_vec, y_train) best_clf = clf.best_estimator_ print("clf.best_params") print("Accuracy(best): {:.4f}".format(clf.best_score_)) y_pred = best_clf.predict(x_test_vec) score = accuracy_score(y_test, y_pred) print("Accuracy(test): {:.4f}".format(score))
def main():
    logger = logging.getLogger(__name__)

    # load dataset
    x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv", n=5000)

    # feature engineering
    x = [clean_html(text, strip=True) for text in x]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=44)

    # vectorization
    vectorizer = CountVectorizer(tokenizer=tokenize)
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)
    x_train = x_train.toarray()
    x_test = x_test.toarray()

    # setting hyperparameters
    vocab_size = len(vectorizer.vocabulary_)
    label_size = len(set(y_train))

    # build model
    model = create_model(vocab_size, label_size)
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])

    # declare callbacks
    filepath = "./log/model.h5"
    cb_early = EarlyStopping(
        monitor="val_loss",          # monitored quantity, default: val_loss (validation loss)
        min_delta=0,                 # default: 0
        patience=3,                  # epochs to wait before stopping, default: 0
        verbose=0,                   # default: 0
        mode="auto",                 # default: auto
        baseline=None,               # default: None
        restore_best_weights=False   # default: False
    )
    cb_modelcp = ModelCheckpoint(
        filepath=filepath,           # path to save the model; REQUIRED ARGUMENT
        monitor="val_loss",          # monitored quantity, default: val_loss (validation loss)
        verbose=0,                   # default: 0
        save_best_only=True,         # if True, save only the best model (saves disk space), default: False
        save_weights_only=False,     # if True, save only the weights (no model architecture), default: False
        mode="auto",                 # default: auto
        period=1                     # default: 1
    )
    # To use TensorBoard, run the command below in a terminal:
    #     tensorboard --logdir=./logs (this is the default) --bind_all
    # After training the model, open http://localhost:6006 to use TensorBoard.
    # To exit TensorBoard, press Ctrl+C in the terminal.
    cb_tensorb = TensorBoard(
        log_dir="./log",             # path to save the data plotted on TensorBoard, default: ./logs
        histogram_freq=0,            # default: 0
        batch_size=32,               # default: 32
        write_graph=True,            # default: True
        write_grads=False,           # default: False
        write_images=False,          # default: False
        embeddings_freq=0,           # default: 0
        embeddings_layer_names=None, # default: None
        embeddings_metadata=None,    # default: None
        embeddings_data=None,        # default: None
        update_freq="epoch"          # default: epoch
    )
    """callbacks = [
        cb_early,
        cb_modelcp,
        cb_tensorb
    ]"""
    callbacks = [
        EarlyStopping(
            patience=3,              # epochs to wait before stopping, default: 0
        ),
        ModelCheckpoint(
            filepath=filepath,       # path to save the model; REQUIRED ARGUMENT
            save_best_only=True,     # if True, save only the best model (saves disk space), default: False
        ),
        TensorBoard(
            log_dir="./log",         # path to save the data plotted on TensorBoard, default: ./logs
        )
    ]

    # training model
    # @stop_watch
    def train_model(x_train, y_train):
        return model.fit(x_train, y_train,
                         validation_split=0.2,
                         epochs=100,
                         batch_size=32,
                         callbacks=callbacks)

    history = train_model(x_train, y_train)

    # load saved model
    model = load_model(filepath)

    # describe model
    # if this fails, try the following commands in a terminal:
    #     pip install pydot==1.2.3 pydot_ng && apt-get install graphviz
    plot_model(model, to_file="./log/model.png")

    # predict by model
    text = "このアプリ超最高!"
    vec = vectorizer.transform([text])
    y_pred = model.predict(vec.toarray())
    logger.debug(f"""input text is "{text}".""")
    logger.debug(f"predict: {y_pred}")

    # plot acc and loss graphs
    plot_history(history)
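# `create_model` and `plot_history` come from elsewhere in the repo; the sketches
# below are assumptions only: a simple Keras MLP over the bag-of-words input (sized
# to the vocabulary) and a matplotlib plot of the accuracy/loss curves in `history`.
# The import path may differ with standalone Keras, and older versions record the
# accuracy metric under "acc" rather than "accuracy".
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def create_model(vocab_size, label_size, hidden_size=32):
    model = Sequential([
        Dense(hidden_size, activation="relu", input_shape=(vocab_size,)),
        Dense(label_size, activation="softmax"),
    ])
    return model

def plot_history(history):
    acc_key = "accuracy" if "accuracy" in history.history else "acc"
    for metric in [acc_key, "loss"]:
        plt.figure()
        plt.plot(history.history[metric], label="train")
        plt.plot(history.history["val_" + metric], label="validation")
        plt.xlabel("epoch")
        plt.ylabel(metric)
        plt.legend()
    plt.show()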
def main():
    logger = logging.getLogger(__name__)
    x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv", n=5000)

    logger.debug("●○ Tokenization ○●")
    x = [clean_html(text, strip=True) for text in x]
    x = [" ".join(tokenize(text)) for text in x]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=44)

    logger.debug("●○ Binary ○●")
    vectorizer = CountVectorizer(binary=True)
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    logger.debug("●○ Count ○●")
    vectorizer = CountVectorizer(binary=False)
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    logger.debug("●○ TF-IDF; Uni-gram ○●")
    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    logger.debug("●○ TF-IDF; Bi-gram ○●")
    vectorizer = TfidfVectorizer(ngram_range=(2, 2))
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    logger.debug("●○ TF-IDF; Uni- and Bi-grams ○●")
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    logger.debug("●○ TF-IDF; Uni-, Bi-, and Tri-grams ○●")
    vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    logger.debug("●○ Use MeCab; TF-IDF; Uni-gram ○●")  # not written in text
    import MeCab
    path_neologd = '/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd'
    t_mecab = MeCab.Tagger("-Owakati -d {}".format(path_neologd))

    def tokenize_by_mecab(text):
        return list(t_mecab.parse(text).strip().split(" "))

    x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv", n=5000)
    x = [clean_html(text, strip=True) for text in x]
    x = [" ".join(tokenize_by_mecab(text)) for text in x]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=44)
    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    logger.debug("●○ Use MeCab; TF-IDF; Uni- and Bi-grams ○●")  # not written in text
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    logger.debug("●○ Use MeCab; TF-IDF; Uni-, Bi-, and Tri-grams ○●")  # not written in text
    vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)