def train_SVC(filePath):
    """Fit a polynomial-kernel SVC on the training file, pickle it, and print CV metrics.

    Args:
        filePath: Path to a tab-separated file readable by ``pandas.read_csv``.
            It must contain a ``' class'`` column (note the leading space in
            the name), which is used as the label vector.

    Returns:
        None. The fitted model is written to ``model_dumps/SVC_model.pkl``
        and diagnostics are printed to stdout.
    """
    # --- Training ---
    train_df = pandas.read_csv(filePath, sep='\t')
    # Optional experiment, currently disabled:
    # train_df = model_utils.oversample_neutral_class(train_df)

    # ``.values`` rather than ``.as_matrix()``: the latter was removed in
    # pandas 1.0, while ``.values`` is available on old and new versions.
    train_class = train_df[' class'].values
    train_data = model_utils.apply_aspdep_weight(train_df, 0.7)

    # Call form of print so the module parses under both Python 2 and 3.
    print(train_data[0])

    text_clf = SVC(
        C=0.2,
        cache_size=200,
        class_weight=None,
        coef0=0.0,
        decision_function_shape='ovr',
        degree=3,
        gamma=0.5,
        kernel='poly',
        max_iter=-1,
        probability=False,
        random_state=None,
        shrinking=True,
        tol=0.001,
        verbose=False,
    ).fit(train_data, train_class)

    # Sanity check: compare the set of labels predicted on the training data
    # with the set of labels actually present in it.
    print(set(text_clf.predict(train_data)))
    print(set(train_class))

    joblib.dump(text_clf, 'model_dumps/SVC_model.pkl')

    # --- Performance evaluation ---
    # presumably 10-fold cross-validation; confirm in model_utils.get_cv_metrics
    accuracy, clf_report = model_utils.get_cv_metrics(
        text_clf, train_data, train_class, k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
def train_StackedGeneralizer(filePath):
    """Fit a stacked ensemble over previously pickled base models and print CV accuracy.

    The base models are loaded from ``Multinomial_nb_model.pkl``,
    ``Bernoulli_nb_model.pkl``, ``SGD_model.pkl`` and ``RF_model.pkl`` in the
    current working directory, so those files must already exist.

    Args:
        filePath: Path to a tab-separated file readable by ``pandas.read_csv``.
            It must contain a ``' class'`` column (note the leading space in
            the name), which is used as the label vector.

    Returns:
        None. The accuracy is printed to stdout; the stacked model itself is
        not persisted (the dump call is commented out).
    """
    # --- Training ---
    train_df = pandas.read_csv(filePath, sep='\t')
    # Optional experiment, currently disabled:
    # train_df = model_utils.oversample_neutral_class(train_df)

    # ``.values`` rather than ``.as_matrix()``: the latter was removed in
    # pandas 1.0, while ``.values`` is available on old and new versions.
    train_class = train_df[' class'].values
    train_data_1 = model_utils.apply_aspdep_weight(train_df, 1.7)

    # Alternative kept for reference: build fresh base models instead of
    # loading the pickled ones.
    # base_models = [MultinomialNB(alpha=0.6, fit_prior=True, class_prior=None),
    #                BernoulliNB(alpha=1.2, fit_prior=True, class_prior=None),
    #                linear_model.SGDClassifier(loss='squared_loss', penalty='l2', alpha=1e-3, random_state=607,
    #                                           max_iter=1000000, tol=1e-2)]
    base_models = [
        joblib.load('Multinomial_nb_model.pkl'),
        joblib.load('Bernoulli_nb_model.pkl'),
        joblib.load('SGD_model.pkl'),
        joblib.load('RF_model.pkl'),
    ]

    # Blending model that combines the base models.
    blending_model = LogisticRegression(random_state=607)

    # Multi-stage (stacked) model.
    sg = StackedGeneralizer(base_models, blending_model, n_folds=10, verbose=False)
    sg.fit(train_data_1, train_class)
    # joblib.dump(sg, 'Stacked_model.pkl')

    # --- Performance evaluation ---
    # presumably 10-fold cross-validation; confirm in model_utils.get_cv_metrics
    accuracy, clf_report = model_utils.get_cv_metrics(
        sg, train_data_1, train_class, k_split=10)
    print("Accuracy: ", accuracy)
    # Recorded result: Accuracy: 0.7497418660799471  Weights: 1.7
def train_SGD(filePath):
    """Fit an SGDClassifier on the training file, pickle it, and print CV metrics.

    Args:
        filePath: Path to a tab-separated file readable by ``pandas.read_csv``.
            It must contain a ``' class'`` column (note the leading space in
            the name), which is used as the label vector.

    Returns:
        None. The fitted model is written to ``SGD_model.pkl`` in the current
        working directory and the metrics are printed to stdout.
    """
    # --- Training ---
    train_df = pandas.read_csv(filePath, sep='\t')
    # Optional experiment, currently disabled:
    # train_df = model_utils.oversample_neutral_class(train_df)

    # ``.values`` rather than ``.as_matrix()``: the latter was removed in
    # pandas 1.0, while ``.values`` is available on old and new versions.
    train_class = train_df[' class'].values
    train_data_1 = model_utils.apply_aspdep_weight(train_df, 0.5)

    text_clf = linear_model.SGDClassifier(
        loss='squared_loss',
        penalty='l2',
        alpha=1e-3,
        random_state=607,
        max_iter=1000000,
        tol=1e-2,
    ).fit(train_data_1, train_class)
    # Recorded results:
    #   Accuracy: 0.7797260574839455 @ 2.1 weight
    #   Accuracy: 0.7428003692958715  Weights: 0.5

    joblib.dump(text_clf, 'SGD_model.pkl')

    # --- Performance evaluation ---
    # presumably 10-fold cross-validation; confirm in model_utils.get_cv_metrics
    accuracy, clf_report = model_utils.get_cv_metrics(
        text_clf, train_data_1, train_class, k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
def train_polarity_clf(filePath):
    """Fit a logistic regression on the ``opin_polarity`` column and print CV metrics.

    Args:
        filePath: Path to a tab-separated file readable by ``pandas.read_csv``.
            It must contain a ``' class'`` column (note the leading space in
            the name), used as the label vector, and an ``'opin_polarity'``
            column, used as the only feature.

    Returns:
        None. The metrics are printed to stdout; the model is not persisted.
    """
    # --- Training ---
    train_df = pandas.read_csv(filePath, sep='\t')
    train_df = model_utils.oversample_neutral_class(train_df)

    # ``.values`` rather than ``.as_matrix()``: the latter was removed in
    # pandas 1.0, while ``.values`` is available on old and new versions.
    train_class = train_df[' class'].values
    train_data = train_df['opin_polarity'].values

    # Call form of print so the module parses under both Python 2 and 3.
    print(train_data)

    # A single selected column is 1-D, but scikit-learn estimators expect a
    # 2-D (n_samples, n_features) matrix, so turn it into one feature column.
    if train_data.ndim == 1:
        train_data = train_data.reshape(-1, 1)

    # Alternative kept for reference:
    # text_clf = BernoulliNB(alpha=1.0, fit_prior=True, class_prior=None).fit(train_data, train_class)

    # ``fit`` needs the labels as well; the original call passed only the
    # features and therefore raised a TypeError.
    text_clf = LogisticRegression(random_state=0).fit(train_data, train_class)

    # --- Performance evaluation ---
    # presumably 10-fold cross-validation; confirm in model_utils.get_cv_metrics
    accuracy, clf_report = model_utils.get_cv_metrics(
        text_clf, train_data, train_class, k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
def train_ET(filePath):
    """Fit an ExtraTreesClassifier on the training file and print CV accuracy.

    Args:
        filePath: Path to a tab-separated file readable by ``pandas.read_csv``.
            It must contain a ``' class'`` column (note the leading space in
            the name), which is used as the label vector.

    Returns:
        None. Only the accuracy is printed to stdout; the model is not
        persisted.
    """
    # --- Training ---
    train_df = pandas.read_csv(filePath, sep='\t')
    # Optional experiment, currently disabled:
    # train_df = model_utils.oversample_neutral_class(train_df)

    # ``.values`` rather than ``.as_matrix()``: the latter was removed in
    # pandas 1.0, while ``.values`` is available on old and new versions.
    train_class = train_df[' class'].values
    train_data_1 = model_utils.apply_aspdep_weight(train_df, 0.3)

    text_clf = ExtraTreesClassifier(
        n_estimators=10,
        max_depth=2,
        random_state=0,
        n_jobs=-1,
    ).fit(train_data_1, train_class)

    # --- Performance evaluation ---
    # presumably 10-fold cross-validation; confirm in model_utils.get_cv_metrics
    accuracy, clf_report = model_utils.get_cv_metrics(
        text_clf, train_data_1, train_class, k_split=10)
    print("Accuracy: ", accuracy)
def train_BernoulliNB(filePath):
    """Fit a Bernoulli naive Bayes model on the training file, pickle it, and print CV metrics.

    Args:
        filePath: Path to a tab-separated file readable by ``pandas.read_csv``.
            It must contain a ``' class'`` column (note the leading space in
            the name), which is used as the label vector.

    Returns:
        None. The fitted model is written to ``Bernoulli_nb_model.pkl`` in the
        current working directory and the metrics are printed to stdout.
    """
    # --- Training ---
    train_df = pandas.read_csv(filePath, sep='\t')
    # Optional experiment, currently disabled:
    # train_df = model_utils.oversample_neutral_class(train_df)

    # ``.values`` rather than ``.as_matrix()``: the latter was removed in
    # pandas 1.0, while ``.values`` is available on old and new versions.
    train_class = train_df[' class'].values
    train_data_1 = model_utils.apply_aspdep_weight(train_df, 0.0)

    text_clf = BernoulliNB(alpha=0.6, fit_prior=True, class_prior=None).fit(
        train_data_1, train_class)
    # Recorded result: 1.2  Accuracy: 0.7047700662655685  Weights: 0.0  Alpha 0.6

    joblib.dump(text_clf, 'Bernoulli_nb_model.pkl')

    # --- Performance evaluation ---
    # presumably 10-fold cross-validation; confirm in model_utils.get_cv_metrics
    accuracy, clf_report = model_utils.get_cv_metrics(
        text_clf, train_data_1, train_class, k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
def train_MultinomialNB(filePath):
    """Fit a multinomial naive Bayes model on the training file, pickle it, and print CV metrics.

    Args:
        filePath: Path to a tab-separated file readable by ``pandas.read_csv``.
            It must contain a ``' class'`` column (note the leading space in
            the name), which is used as the label vector.

    Returns:
        None. The fitted model is written to ``Multinomial_nb_model.pkl`` in
        the current working directory and the metrics are printed to stdout.
    """
    # --- Training ---
    train_df = pandas.read_csv(filePath, sep='\t')
    # Optional experiment, currently disabled:
    # train_df = model_utils.oversample_neutral_class(train_df)

    # ``.values`` rather than ``.as_matrix()``: the latter was removed in
    # pandas 1.0, while ``.values`` is available on old and new versions.
    train_class = train_df[' class'].values
    train_data_1 = model_utils.apply_aspdep_weight(train_df, 0.9)

    text_clf = MultinomialNB(alpha=0.3, fit_prior=True, class_prior=None).fit(
        train_data_1, train_class)
    # Recorded result: 1.2, 0.01  Accuracy: 0.7407742366772097  Weights: 1.2

    joblib.dump(text_clf, 'Multinomial_nb_model.pkl')

    # --- Performance evaluation ---
    # presumably 10-fold cross-validation; confirm in model_utils.get_cv_metrics
    accuracy, clf_report = model_utils.get_cv_metrics(
        text_clf, train_data_1, train_class, k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
def train_RF(filePath):
    """Fit a RandomForestClassifier on the training file, pickle it, and print CV metrics.

    Args:
        filePath: Path to a tab-separated file readable by ``pandas.read_csv``.
            It must contain a ``' class'`` column (note the leading space in
            the name), which is used as the label vector.

    Returns:
        None. The fitted model is written to ``RF_model.pkl`` in the current
        working directory and the metrics are printed to stdout.
    """
    # --- Training ---
    train_df = pandas.read_csv(filePath, sep='\t')
    # Optional experiment, currently disabled:
    # train_df = model_utils.oversample_neutral_class(train_df)

    # ``.values`` rather than ``.as_matrix()``: the latter was removed in
    # pandas 1.0, while ``.values`` is available on old and new versions.
    train_class = train_df[' class'].values
    train_data_1 = model_utils.apply_aspdep_weight(train_df, 1.1)

    text_clf = RandomForestClassifier(
        n_estimators=300,
        max_depth=180,
        random_state=607,
        n_jobs=-1,
    ).fit(train_data_1, train_class)

    joblib.dump(text_clf, 'RF_model.pkl')

    # --- Performance evaluation ---
    # presumably 10-fold cross-validation; confirm in model_utils.get_cv_metrics
    accuracy, clf_report = model_utils.get_cv_metrics(
        text_clf, train_data_1, train_class, k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)