# --- Greedy forward feature selection over handcrafted features ---
# (fragment: `chosen_features` is assumed to be initialized to [] earlier in the
# script, and the step that compares val_accuracy against best_val and appends
# the winning feature falls outside this excerpt)
max_val_score = 0
while len(chosen_features) < 1:  # with this bound, only the first feature is selected
    best_val = 0.
    best_feature = None
    for feature in feature_list:
        if feature in chosen_features:
            continue
        # train a fresh model on the already-chosen features plus the candidate
        sent_encoder.feature_names = chosen_features + [feature]
        model = SimplePQModel(sent_encoder=sent_encoder,
                              clf_type=AdaBoostClassifier,
                              clf_args={'n_estimators': 100,
                                        'base_estimator': DecisionTreeClassifier(max_depth=1,
                                                                                 class_weight="balanced")})
        model.fit(train_articles)
        val_accuracy = E.evaluate(model=model, articles=val_articles, verbose=0)
        test_accuracy = E.evaluate(model=model, articles=test_articles, verbose=0)
        res_str = "{}\t{:.1f}\t{:.1f}".format(', '.join(chosen_features + [feature]),
                                              100 * val_accuracy,
                                              100 * test_accuracy)
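# --- Illustrative sketch (added; not part of the original script) ---
# The loop above is greedy forward feature selection: each round trains one
# model per unchosen feature and keeps the feature with the best validation
# accuracy. A minimal self-contained version in plain scikit-learn (synthetic
# feature matrix; the function name and setup here are hypothetical; note that
# `base_estimator` was renamed `estimator` in scikit-learn 1.2+):
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

def greedy_forward_selection(X, y, n_features):
    """Greedily pick `n_features` column indices of X by validation accuracy."""
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=0)
    chosen = []
    while len(chosen) < n_features:
        best_score, best_idx = -1.0, None
        for idx in range(X.shape[1]):
            if idx in chosen:
                continue
            cols = chosen + [idx]
            clf = AdaBoostClassifier(
                n_estimators=100,
                base_estimator=DecisionTreeClassifier(max_depth=1,
                                                      class_weight="balanced"))
            clf.fit(X_tr[:, cols], y_tr)
            score = clf.score(X_val[:, cols], y_val)
            if score > best_score:
                best_score, best_idx = score, idx
        chosen.append(best_idx)  # keep the feature that helped most this round
    return chosen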
# (fragment: the tail of a large character-set list, presumably one entry of
# char_sets_options; the opening of the list falls outside this excerpt)
    '🇸', '🌈', '💙', '\u200e', 'å', '🖤', '‐', '̂', '💥', '😉', 'щ', 'ё', '🤷', 'ц', '💖', '🍹',
    '🍊', '{', '}', '🤔', '💋', '\U0001f9e1', '\u200f', '𝐞', '🛑', 'º', '🇹', '☕', 'ɪ', '🌊', '💁', '🇺',
    '🤗', '🙏', '×', '💚', '¯', '🌺', '→', '♂', '전', '율', '리', '✈', 'ń', '💦', '☁', '🌟',
    'ʼ', '‚', '👏', '😏', '🤦', '🎉', '🎄', '💪', '👄', '👑', '♥', '¹', 'ā', '💍', '𝐖', '𝐨',
    '𝐥', '👌', '🥂', '💅', '☝', '👋', '😃', '⚾', '😅', '🛁', '🇷', '̀', 'ю', 'э', '▪', 'ᴛ',
    'ɴ', 'ᴀ', 'ᴇ', '🐝', '➡', '🌅', '⛱', 'ś', '🇧', '💕', '⚡', '🌎', '\\', 'ツ', 'œ', '👯',
    '😭', '💔', '👀', '🍕', '🎶', '🙆', 'ž', '😊', '�', '📍', '🍃', '💎', '⛵', '♡', '😳', '\U0001f90d',
    '😐', 'ù', '🌸', '¬', '‑', '👎', '\U0001f9d8', '−', '🐋', '🎀', '👸', '😆', '💸', '😪', '🍎', '👭',
    '😋', '🖕', '😑', '🐈', '👜', '🙀', '😼', '😽', '👊', '´', '🤓', '\U0001f929', '😎', '🤡', '🎅', '🔱',
    '💄', '⚜', '🇫', '𝐝', '𝐓', '𝐚', '𝐯', '𝐌', '𝐦', '𝐢', '𝐭', '¾', 'ℓ', '¨', '👰', '😁',
    '🏋', '😝', '👇', '😚', '💏', '🗳', '🏙', '🕌', 'ф', '👗', '🛍', 'ʀ', 'ʟ', '💌', '🛣', '🚖',
    '🔍', '🏬', '🌉', '🎭', '🏠', '🌳', '📏', 'ą', '🏅', '😩', '💀', '🎃', '👶', '', '🥞', 'fi',
    '🐔', '🏕', '̊', '\U0001f974', '🙃', '✊', '̃', '🌚', '🌻', '😬', '🙋', '🙂', '🇮', '☺', '🏄', '💐',
    '🏁', '🤠', '😢', '❗', '♐', '🍟', 'm', 'u', 'n', 'r', 'o', 'e', '🎼', 'ʔ', '≠', '😘',
    '😵', '🍅', '😮', '🐚', '🤸', '✮', '🦐', '🐠', '🌏', '⃣', '🛶', '😻', '\U0001f9d0', '🇭', '🚜', 'ß',
    '💰', '\u2009', '¼', '💡', '🚁', '🇬', '🐣', '🗺', '\U0001f6f8', '🌑', '🤘', '🌋', '🥐', '🧀', '🍳', '🥓',
    '🍷', '🥑', '🍤', '🍸', '😰', '─', '😜', '🌼', '🎠', '🙈', '🎡', '🏾', '🍑', '🍫', 'ć', '⛈',
    'ʺ', '☾', '˚', '\U0001f976', '🏖', '\U0001f9da', '🐶', '🍁', '⚓', 'ż']

# grid search over character sets and quantile counts
for char_sets in char_sets_options:
    for quantiles in [1, 2, 5, 10, 15]:
        sent_encoder = TrueCharPositionEncoder(char_sets=char_sets, quantiles=quantiles)
        model = SimplePQModel(sent_encoder=sent_encoder,
                              clf_type=LogisticRegression,
                              clf_args={'class_weight': 'balanced', 'max_iter': 1000, 'solver': 'lbfgs'})
        #model = SimpleNNModel(sent_encoder=sent_encoder, layer_sizes=layer_sizes, layer_dropouts=layer_dropouts)
        model.fit(train_articles)

        #coefs = model.model.coef_[0]
        #coef_str = "\t".join([str(round(v, 3)) for v in coefs])
        #
        #val_accuracy = E.evaluate(model=model, articles=val_articles, verbose=0)
        test_accuracy = E.evaluate(model=model, articles=test_articles, verbose=0)

        res_str = "{}\t{}\t{}\t{:.2f}".format(sent_encoder.name, char_sets, quantiles, 100 * test_accuracy)
        print(res_str)
        results_file.write(res_str + "\n")
        results_file.flush()

# the triple quote below opens a commented-out block that continues past this excerpt
'''
for quantiles in [2, 5, 10, 20]:
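# --- Illustrative sketch (added; not part of the original script) ---
# TrueCharPositionEncoder is project code that isn't shown here. Assuming it
# histograms the relative positions of characters from `char_sets` into
# `quantiles` equal slices of the sentence, a sketch could look like this
# (the function below is hypothetical, not the project's implementation):
import numpy as np

def char_position_features(sentence, char_set, quantiles):
    """Count how many characters from `char_set` fall into each of `quantiles`
    equal-width slices of the sentence, by relative character position."""
    counts = np.zeros(quantiles)
    n = max(len(sentence), 1)
    for i, ch in enumerate(sentence):
        if ch in char_set:
            counts[min(int(i / n * quantiles), quantiles - 1)] += 1
    return counts

# e.g. char_position_features("Wow! Great news!", {'!'}, 4) -> [1., 0., 0., 1.]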
# (fragment: these imports and the `elif` below belong to a model-selection
# `if model_name == ...:` chain whose opening branch falls outside this excerpt)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from models.sentence_encoders import HandcraftedEncoder

#sent_encoder = HandcraftedEncoder()
sent_encoder = HandcraftedEncoder(precomputed_embeddings=settings.PRECOMPUTED_HANDCRAFTED_EMBEDDINGS_FNAME)

feature_list = ["Quote_count", "Sent_position", "R_difficult",
                "POS_PRP", "POS_VB", "A_concreteness"]  #HandcraftedEncoder._all_features + "best"
#feature = "best"

# train one AdaBoost (decision-stump) model per handcrafted feature and
# collect its generated samples on the test articles
for feature in feature_list:
    print(feature)
    sent_encoder.set_features(feature)
    model = SimplePQModel(sent_encoder=sent_encoder,
                          clf_type=AdaBoostClassifier,
                          clf_args={'n_estimators': 100,
                                    'base_estimator': DecisionTreeClassifier(max_depth=1,
                                                                             class_weight="balanced")})
    print("training {}...".format(feature))
    model.fit(train_articles)
    print("generating...")
    combined_samples[feature] = generate_samples(model, test_articles)

elif model_name == "ngrams":
    from models.sentence_encoders import NGramEncoder

    # character bigrams and word unigrams
    for mode, n in [('char', 2), ('word', 1)]:
        print(mode, n)
        sent_encoder = NGramEncoder(mode=mode, n=n, store_results=False, vocab_size=1000)
        print("preparing encoder...")
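# --- Illustrative sketch (added; not part of the original script) ---
# NGramEncoder is project code; under the assumption that it counts the
# `vocab_size` most frequent character or word n-grams, scikit-learn's
# CountVectorizer yields an equivalent representation (this helper is
# hypothetical, not the project's API):
from sklearn.feature_extraction.text import CountVectorizer

def make_ngram_vectorizer(mode, n, vocab_size=1000):
    """Count n-grams over characters ('char') or words ('word'), keeping
    only the `vocab_size` most frequent across the corpus."""
    return CountVectorizer(analyzer='char' if mode == 'char' else 'word',
                           ngram_range=(n, n), max_features=vocab_size)

# e.g. make_ngram_vectorizer('char', 2).fit_transform(sentences) gives a sparse
# matrix of character-bigram counts, analogous to NGramEncoder(mode='char', n=2).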