def run_svm_stem(): train_data = pd.read_csv('../datasets/train_set.csv', sep="\t") test_data = pd.read_csv('../datasets/test_set.csv', sep="\t") scorer = MultiScorer({ 'Accuracy': (accuracy_score, {}), 'Precision': (precision_score, { 'average': 'macro' }), 'Recall': (recall_score, { 'average': 'macro' }), 'F1': (f1_score, { 'average': 'macro' }) }) le = preprocessing.LabelEncoder() le.fit(train_data["Category"]) y = le.transform(train_data["Category"]) print "Preprocessing training data..." weighted_titles_train = train_data["Title"] for i in range(5): weighted_titles_train = weighted_titles_train + " " + train_data[ "Title"] train_data["Content"] = weighted_titles_train + " " + train_data["Content"] stemmer = SnowballStemmer("english", ignore_stopwords=True) class StemmedCountVectorizer(CountVectorizer): def build_analyzer(self): analyzer = super(StemmedCountVectorizer, self).build_analyzer() return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)]) stemmed_count_vect = StemmedCountVectorizer(stop_words='english') vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS) svd_model = TruncatedSVD(n_components=1000) svd_transformer = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), ('svd', svd_model)]) svd_matrix_train = svd_transformer.fit_transform(train_data["Content"]) clf = svm.SVC(kernel='rbf', C=1000, gamma=0.001, random_state=42) print "Running SVM classifier with stemming..." k_fold = cross_validation.KFold(len(train_data["Content"]), n_folds=10, shuffle=True, random_state=42) cross_val_score(clf, svd_matrix_train, y, cv=k_fold, scoring=scorer) return scorer.get_results()
def run_svm_opt(): from config import SVM_OPT as svm_opt scorer = MultiScorer({ 'Accuracy': (accuracy_score, {}), 'Precision': (precision_score, { 'average': 'macro' }), 'Recall': (recall_score, { 'average': 'macro' }), 'F1': (f1_score, { 'average': 'macro' }) }) train_data = pd.read_csv('../datasets/train_set.csv', sep="\t") test_data = pd.read_csv('../datasets/test_set.csv', sep="\t") le = preprocessing.LabelEncoder() le.fit(train_data["Category"]) y = le.transform(train_data["Category"]) print "Preprocessing training data..." weighted_titles_train = train_data["Title"] for i in range(svm_opt['title_weight'] - 1): weighted_titles_train = weighted_titles_train + " " + train_data[ "Title"] train_data["Content"] = weighted_titles_train + " " + train_data["Content"] vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS) svd_model = TruncatedSVD(n_components=svm_opt['n-components']) svd_transformer = Pipeline([('tfidf', vectorizer), ('svd', svd_model)]) svd_matrix_train = svd_transformer.fit_transform(train_data["Content"]) if (svm_opt['SVM-kernel'] == 'linear'): clf = svm.SVC(kernel='linear', C=svm_opt['SVM-C'], random_state=42) else: clf = svm.SVC(kernel=svm_opt['SVM-kernel'], C=svm_opt['SVM-C'], gamma=svm_opt['SVM-gamma'], random_state=42) print "Running SVM classifier..." k_fold = cross_validation.KFold(len(train_data["Content"]), n_folds=10, shuffle=True, random_state=42) cross_val_score(clf, svd_matrix_train, y, cv=k_fold, scoring=scorer) return scorer.get_results()
def run_mnb_opt(): from config import MNB_OPT as mnb_opt scorer = MultiScorer({ 'Accuracy': (accuracy_score, {}), 'Precision': (precision_score, { 'average': 'macro' }), 'Recall': (recall_score, { 'average': 'macro' }), 'F1': (f1_score, { 'average': 'macro' }) }) train_data = pd.read_csv('../datasets/train_set.csv', sep="\t") test_data = pd.read_csv('../datasets/test_set.csv', sep="\t") le = preprocessing.LabelEncoder() le.fit(train_data["Category"]) y = le.transform(train_data["Category"]) print "Preprocessing training data..." weighted_titles_train = train_data["Title"] for i in range(mnb_opt['title_weight'] - 1): weighted_titles_train = weighted_titles_train + " " + train_data[ "Title"] train_data["Content"] = weighted_titles_train + " " + train_data["Content"] vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS) svd_transformer = Pipeline([('tfidf', vectorizer)]) svd_matrix_train = svd_transformer.fit_transform(train_data["Content"]) clf = MultinomialNB() print "Running MultinomialNB classifier..." k_fold = cross_validation.KFold(len(train_data["Content"]), n_folds=10, shuffle=True, random_state=42) cross_val_score(clf, svd_matrix_train, y, cv=k_fold, scoring=scorer) return scorer.get_results()
def model_score(classifiers): train_X = full_data.drop('Stake', axis=1) train_y = full_data['Stake'] scorer = MultiScorer({ 'R-Square': (m.r2_score, {}), 'MSE': (m.mean_squared_error, {}) }) res_score = {} for name, clf in classifiers.items(): start = time.time() print(name) cross_val_score(clf, train_X, train_y, cv=5, scoring=scorer) results = scorer.get_results() res_score[name] = results for metric_name in results.keys(): average_score = np.average(results[metric_name]) print('%s : %f' % (metric_name, average_score)) print 'time', time.time() - start, '\n\n' return res_score
features.isnull().sum() features=features[np.isfinite(features['Urbanicity'])] features=features[np.isfinite(features['male'])] features.dropna(axis=0, how='any', thresh=None, inplace=True) # Split the data into training and testing sets train_features, test_features, train_labels, test_labels = train_test_split(features,labels, test_size = 0.2)#, random_state = 42) print('Training Features Shape:', train_features.shape) print('Training Labels Shape:', train_labels.shape) print('Testing Features Shape:', test_features.shape) print('Testing Labels Shape:', test_labels.shape) #scorer = {'acc': 'accuracy','prec_macro': 'precision_macro','rec_micro': 'recall_macro','f1':'f1','roc_auc':'roc+auc'} scorer = MultiScorer({ 'Accuracy' : (accuracy_score, {}), 'Precision' : (precision_score, {'pos_label': 1, 'average':'macro'}), 'Recall' : (recall_score, {'pos_label': 1, 'average':'macro'})}) for val in range (20,60,10): #get penal val #NESIS 880 #NTDB 260 #for val in range (3,20): #get tree depth #for val in range (26,36,1): #get num of estimator #clf = RandomForestClassifier(max_depth=7,class_weight={1:880,0:1},max_features='auto',n_estimators=26) clf = XGBClassifier(scale_pos_weight=val,max_depth=5,learning_rate =0.001) #validated=cross_val_score(clf,train_features,train_labels,cv=5,scoring='f1',n_jobs=8) #scores.append(validated) cross_val_score(clf,train_features,train_labels,cv=5,scoring=scorer,n_jobs=8) results = scorer.get_results() for metric_name in results.keys():
def classifyData(label, binarydata, textvec, cutoff):
    """Benchmark several classifiers on tf-idf features built from *textvec*,
    then fit a LinearSVC on a held-out split and print its most indicative
    unigrams and bigrams per category.

    Args:
        label: name of the target column in *binarydata*.
        binarydata: DataFrame holding the target column; gains a
            'category_id' column as a side effect.
        textvec: iterable of raw documents, one per row of *binarydata*.
        cutoff: min_df passed to TfidfVectorizer (minimum document frequency).
    """
    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=0)
    ]

    # Map each distinct label value to a dense integer id, and keep a
    # value -> id lookup for the reporting step at the end.
    binarydata['category_id'] = binarydata[label].factorize()[0]
    category_id_df = binarydata[[label, 'category_id']] \
        .drop_duplicates().sort_values('category_id')
    category_to_id = dict(category_id_df.values)

    CV = 5
    # Word + bigram tf-idf features; 'cutoff' drops rare terms.
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=cutoff,
                            encoding='latin-1', ngram_range=(1, 2))
    features = tfidf.fit_transform(textvec).toarray()
    labels = binarydata['category_id']

    # Cross-validate each candidate model with a fresh scorer so per-fold
    # results do not accumulate across models.
    for model in models:
        scorer = MultiScorer({
            'F-measure': (f1_score, {'pos_label': 1, 'average': 'binary'}),
            'Accuracy': (accuracy_score, {}),
            'Precision': (precision_score, {'average': 'binary'}),
            'Recall': (recall_score, {'average': 'binary'})
        })
        model_name = model.__class__.__name__
        cross_val_score(model, features, labels, scoring=scorer, cv=CV)
        results = scorer.get_results()
        print(model_name)
        for metric_name in results.keys():
            average_score = np.average(results[metric_name])
            print('%s : %f' % (metric_name, average_score))

    # Refit a LinearSVC on a train split and inspect its strongest features.
    model = LinearSVC()
    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
        features, labels, binarydata.index, test_size=0.33, random_state=0)
    model.fit(X_train, y_train)

    N = 10
    for Product, category_id in sorted(category_to_id.items()):
        print(Product, category_id)
        # NOTE(review): for a binary target, LinearSVC.coef_ has a single
        # row, so indexing with category_id == 1 would raise IndexError —
        # confirm the intended number of classes.
        indices = np.argsort(model.coef_[int(category_id)])
        feature_names = np.array(tfidf.get_feature_names())[indices]
        # feature_names is sorted ascending by weight; reversed() walks from
        # the highest-weighted term down.
        unigrams = [
            v for v in reversed(feature_names) if len(v.split(' ')) == 1
        ][:N]
        bigrams = [
            v for v in reversed(feature_names) if len(v.split(' ')) == 2
        ][:N]
        print("# '{}':".format(Product))
        print(" . Top unigrams:\n . {}".format('\n . '.join(unigrams)))
        print(" . Top bigrams:\n . {}".format('\n . '.join(bigrams)))
#('RF',RandomForestClassifier(n_estimators=100, max_depth=3, random_state=seed,min_samples_leaf=3)), #('KNN',KNeighborsClassifier()), #('XGB', XGBClassifier(eval_metric='logloss', use_label_encoder=False)), ('SVM', SVC(gamma='scale')), ('MLP', KerasClassifier(build_fn=lambda: create_network(number_of_features), epochs=100, batch_size=100, verbose=0)) ] model_names = [model_name[0] for model_name in models] scorer = MultiScorer({ 'Accuracy': (accuracy_score, {}), 'Precision': (precision_score, {}), 'Recall': (recall_score, {}), 'F1_score': (f1_score, {}), 'ROC_AUC': (roc_auc_score, {}) }) cpt_time = [] for name, model in models: start_time = time.time() print(name) model_index = model_names.index(name) _ = cross_val_score(model, X, y, scoring=scorer, cv=skf) results = scorer.get_results() for metric_name in results.keys(): average_score = np.average( results[metric_name][num_fold *
from sklearn.metrics import * from multiscorer import MultiScorer path = os.getcwd() # Affectation de X (Variables) et y (Sorties) X = np.loadtxt(path + '/data/X_train.txt') y = np.loadtxt(path + '/data/y_train.txt') scorer = MultiScorer({ 'accuracy': (accuracy_score, {}), 'precision': (precision_score, { 'average': 'macro' }), 'recall': (recall_score, { 'average': 'macro' }), 'AUC': (auc, { 'reorder': True }), 'F-measure': (f1_score, { 'average': 'macro' }) }) precisions = [] ##Affichage des mesures def report(algo, cv, scorer, temps): print("ALGO : {} \ncv : {} \nTemps : {} s \ ".format(algo, cv, round(temps, 3)))