def run_svm_stem():
    train_data = pd.read_csv('../datasets/train_set.csv', sep="\t")
    test_data = pd.read_csv('../datasets/test_set.csv', sep="\t")
    scorer = MultiScorer({
        'Accuracy': (accuracy_score, {}),
        'Precision': (precision_score, {
            'average': 'macro'
        }),
        'Recall': (recall_score, {
            'average': 'macro'
        }),
        'F1': (f1_score, {
            'average': 'macro'
        })
    })
    le = preprocessing.LabelEncoder()
    le.fit(train_data["Category"])
    y = le.transform(train_data["Category"])

    print "Preprocessing training data..."
    weighted_titles_train = train_data["Title"]
    for i in range(5):
        weighted_titles_train = weighted_titles_train + " " + train_data[
            "Title"]

    train_data["Content"] = weighted_titles_train + " " + train_data["Content"]
    stemmer = SnowballStemmer("english", ignore_stopwords=True)

    # CountVectorizer subclass that stems every token with the Snowball stemmer
    class StemmedCountVectorizer(CountVectorizer):
        def build_analyzer(self):
            analyzer = super(StemmedCountVectorizer, self).build_analyzer()
            return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

    stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
    vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
    svd_model = TruncatedSVD(n_components=1000)
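    # LSA pipeline: stemmed counts -> tf-idf weighting -> truncated SVD (1000 components)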
    svd_transformer = Pipeline([('vect', stemmed_count_vect),
                                ('tfidf', TfidfTransformer()),
                                ('svd', svd_model)])
    svd_matrix_train = svd_transformer.fit_transform(train_data["Content"])
    clf = svm.SVC(kernel='rbf', C=1000, gamma=0.001, random_state=42)
    print "Running SVM classifier with stemming..."
    k_fold = cross_validation.KFold(len(train_data["Content"]),
                                    n_folds=10,
                                    shuffle=True,
                                    random_state=42)
    cross_val_score(clf, svd_matrix_train, y, cv=k_fold, scoring=scorer)
    return scorer.get_results()


def run_svm_opt():
    from config import SVM_OPT as svm_opt

    scorer = MultiScorer({
        'Accuracy': (accuracy_score, {}),
        'Precision': (precision_score, {
            'average': 'macro'
        }),
        'Recall': (recall_score, {
            'average': 'macro'
        }),
        'F1': (f1_score, {
            'average': 'macro'
        })
    })
    train_data = pd.read_csv('../datasets/train_set.csv', sep="\t")
    test_data = pd.read_csv('../datasets/test_set.csv', sep="\t")

    le = preprocessing.LabelEncoder()
    le.fit(train_data["Category"])
    y = le.transform(train_data["Category"])

    print "Preprocessing training data..."
    weighted_titles_train = train_data["Title"]
    for i in range(svm_opt['title_weight'] - 1):
        weighted_titles_train = weighted_titles_train + " " + train_data[
            "Title"]

    train_data["Content"] = weighted_titles_train + " " + train_data["Content"]
    vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
    svd_model = TruncatedSVD(n_components=svm_opt['n-components'])
    svd_transformer = Pipeline([('tfidf', vectorizer), ('svd', svd_model)])
    svd_matrix_train = svd_transformer.fit_transform(train_data["Content"])
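    # gamma only applies to non-linear kernels, so the linear case omits it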
    if (svm_opt['SVM-kernel'] == 'linear'):
        clf = svm.SVC(kernel='linear', C=svm_opt['SVM-C'], random_state=42)
    else:
        clf = svm.SVC(kernel=svm_opt['SVM-kernel'],
                      C=svm_opt['SVM-C'],
                      gamma=svm_opt['SVM-gamma'],
                      random_state=42)
    print "Running SVM classifier..."
    k_fold = cross_validation.KFold(len(train_data["Content"]),
                                    n_folds=10,
                                    shuffle=True,
                                    random_state=42)
    cross_val_score(clf, svd_matrix_train, y, cv=k_fold, scoring=scorer)
    return scorer.get_results()


def run_mnb_opt():
    from config import MNB_OPT as mnb_opt
    scorer = MultiScorer({
        'Accuracy': (accuracy_score, {}),
        'Precision': (precision_score, {
            'average': 'macro'
        }),
        'Recall': (recall_score, {
            'average': 'macro'
        }),
        'F1': (f1_score, {
            'average': 'macro'
        })
    })
    train_data = pd.read_csv('../datasets/train_set.csv', sep="\t")
    test_data = pd.read_csv('../datasets/test_set.csv', sep="\t")

    le = preprocessing.LabelEncoder()
    le.fit(train_data["Category"])
    y = le.transform(train_data["Category"])

    print "Preprocessing training data..."
    weighted_titles_train = train_data["Title"]
    for i in range(mnb_opt['title_weight'] - 1):
        weighted_titles_train = weighted_titles_train + " " + train_data[
            "Title"]

    train_data["Content"] = weighted_titles_train + " " + train_data["Content"]
    vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
    svd_transformer = Pipeline([('tfidf', vectorizer)])
    svd_matrix_train = svd_transformer.fit_transform(train_data["Content"])
    clf = MultinomialNB()
    print "Running MultinomialNB classifier..."
    k_fold = cross_validation.KFold(len(train_data["Content"]),
                                    n_folds=10,
                                    shuffle=True,
                                    random_state=42)
    cross_val_score(clf, svd_matrix_train, y, cv=k_fold, scoring=scorer)
    return scorer.get_results()
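

# Illustrative sketch (assumption, not part of the original snippets): the dict
# returned by the functions above maps each metric name to a list of per-fold
# scores, so a typical way to consume it is to average each metric.
import numpy as np

def print_average_scores(results):
    # results: e.g. {'Accuracy': [...], 'Precision': [...], 'Recall': [...], 'F1': [...]}
    for metric_name, fold_scores in results.items():
        print('%s : %f' % (metric_name, np.average(fold_scores)))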
Example #4
def model_score(classifiers):
    train_X = full_data.drop('Stake', axis=1)
    train_y = full_data['Stake']
    scorer = MultiScorer({
        'R-Square': (m.r2_score, {}),
        'MSE': (m.mean_squared_error, {})
    })
    res_score = {}

    for name, clf in classifiers.items():
        start = time.time()
        print(name)
        cross_val_score(clf, train_X, train_y, cv=5, scoring=scorer)
        results = scorer.get_results()
        res_score[name] = results

        for metric_name in results.keys():
            average_score = np.average(results[metric_name])
            print('%s : %f' % (metric_name, average_score))

        print('time', time.time() - start, '\n\n')

    return res_score
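
# Hypothetical usage sketch (assumption): model_score reads a module-level
# DataFrame named full_data and takes a dict mapping names to scikit-learn
# estimators; the two regressors below are only illustrative.
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

example_models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=100),
}
# scores = model_score(example_models)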
Example #5
# Inspect missing values, then drop rows that cannot be used
print(features.isnull().sum())
features = features[np.isfinite(features['Urbanicity'])]
features = features[np.isfinite(features['male'])]
features.dropna(axis=0, how='any', thresh=None, inplace=True)


# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.2)  # , random_state = 42
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

#scorer = {'acc': 'accuracy', 'prec_macro': 'precision_macro', 'rec_micro': 'recall_macro', 'f1': 'f1', 'roc_auc': 'roc_auc'}
scorer = MultiScorer({
    'Accuracy' : (accuracy_score, {}),
    'Precision' : (precision_score, {'pos_label': 1, 'average':'macro'}),
    'Recall' : (recall_score, {'pos_label': 1, 'average':'macro'})})
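# Note: scikit-learn ignores pos_label when average='macro' is used.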


for val in range(20, 60, 10):  # sweep the scale_pos_weight penalty value
#NESIS 880 
#NTDB 260
#for val in range (3,20): #get tree depth
#for val in range (26,36,1): #get num of estimator
    #clf = RandomForestClassifier(max_depth=7,class_weight={1:880,0:1},max_features='auto',n_estimators=26)
    clf = XGBClassifier(scale_pos_weight=val, max_depth=5, learning_rate=0.001)
    #validated=cross_val_score(clf,train_features,train_labels,cv=5,scoring='f1',n_jobs=8)
    #scores.append(validated)
    cross_val_score(clf, train_features, train_labels, cv=5, scoring=scorer, n_jobs=8)
    results = scorer.get_results()
    for metric_name in results.keys():
        average_score = np.average(results[metric_name])
        print('%s : %f' % (metric_name, average_score))
Example #6
def classifyData(label, binarydata, textvec, cutoff):
    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=0)
    ]

    binarydata['category_id'] = binarydata[label].factorize()[0]
    from io import StringIO
    category_id_df = binarydata[[
        label, 'category_id'
    ]].drop_duplicates().sort_values('category_id')
    category_to_id = dict(category_id_df.values)
    id_to_category = dict(category_id_df[['category_id', label]].values)

    CV = 5
    X_train, X_test, y_train, y_test = train_test_split(textvec,
                                                        binarydata[label],
                                                        random_state=0)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    tfidf = TfidfVectorizer(sublinear_tf=True,
                            min_df=cutoff,
                            encoding='latin-1',
                            ngram_range=(1, 2))
    features = tfidf.fit_transform(textvec).toarray()

    labels = binarydata['category_id']
    cv_df = pd.DataFrame(index=range(CV * len(models)))
    entries = []
    #print(features, labels)
    for model in models:
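        # a fresh MultiScorer per model keeps each model's fold scores separate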
        scorer = MultiScorer({
            'F-measure': (f1_score, {
                'pos_label': 1,
                'average': 'binary'
            }),
            'Accuracy': (accuracy_score, {}),
            'Precision': (precision_score, {
                'average': 'binary'
            }),
            'Recall': (recall_score, {
                'average': 'binary'
            })
        })
        model_name = model.__class__.__name__
        cross_val_score(model, features, labels, scoring=scorer, cv=CV)
        results = scorer.get_results()
        print(model_name)
        for metric_name in results.keys():
            average_score = np.average(results[metric_name])
            print('%s : %f' % (metric_name, average_score))
    '''
    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
    cv_df.groupby('model_name').accuracy.mean()

    X_train, X_test, y_train, y_test = train_test_split(textvec, binarydata[label], random_state = 0)
    count_vec = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    clf = MultinomialNB().fit(X_train_tfidf, y_train)
    print(cv_df.groupby('model_name').accuracy.mean())
    '''

    model = LinearSVC()

    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
        features, labels, binarydata.index, test_size=0.33, random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    from sklearn.feature_selection import chi2

    N = 10
    for Product, category_id in sorted(category_to_id.items()):
        print(Product, category_id)
        indices = np.argsort(model.coef_[int(category_id)])
        feature_names = np.array(tfidf.get_feature_names())[indices]
        unigrams = [
            v for v in reversed(feature_names) if len(v.split(' ')) == 1
        ][:N]
        bigrams = [
            v for v in reversed(feature_names) if len(v.split(' ')) == 2
        ][:N]
        print("# '{}':".format(Product))
        print("  . Top unigrams:\n       . {}".format(
            '\n       . '.join(unigrams)))
        print("  . Top bigrams:\n       . {}".format(
            '\n       . '.join(bigrams)))
Example #7
    #('RF',RandomForestClassifier(n_estimators=100, max_depth=3, random_state=seed,min_samples_leaf=3)),
    #('KNN',KNeighborsClassifier()),
    #('XGB', XGBClassifier(eval_metric='logloss', use_label_encoder=False)),
    ('SVM', SVC(gamma='scale')),
    ('MLP',
     KerasClassifier(build_fn=lambda: create_network(number_of_features),
                     epochs=100,
                     batch_size=100,
                     verbose=0))
]
model_names = [model_name[0] for model_name in models]

scorer = MultiScorer({
    'Accuracy': (accuracy_score, {}),
    'Precision': (precision_score, {}),
    'Recall': (recall_score, {}),
    'F1_score': (f1_score, {}),
    'ROC_AUC': (roc_auc_score, {})
})
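# One scorer instance is shared by every model below, so its results accumulate
# across the cross_val_score calls and are read back per model afterwards.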

cpt_time = []
for name, model in models:
    start_time = time.time()
    print(name)
    model_index = model_names.index(name)
    _ = cross_val_score(model, X, y, scoring=scorer, cv=skf)
    results = scorer.get_results()

    for metric_name in results.keys():
        average_score = np.average(
            results[metric_name][num_fold *
Example #8
import os

import numpy as np

from sklearn.metrics import *
from multiscorer import MultiScorer

path = os.getcwd()
# Assign X (features) and y (target outputs)
X = np.loadtxt(path + '/data/X_train.txt')
y = np.loadtxt(path + '/data/y_train.txt')

scorer = MultiScorer({
    'accuracy': (accuracy_score, {}),
    'precision': (precision_score, {
        'average': 'macro'
    }),
    'recall': (recall_score, {
        'average': 'macro'
    }),
    'AUC': (auc, {
        'reorder': True
    }),
    'F-measure': (f1_score, {
        'average': 'macro'
    })
})

precisions = []


## Display the evaluation metrics
def report(algo, cv, scorer, temps):
    print("ALGO : {}  \ncv : {}  \nTemps : {} s \
          ".format(algo, cv, round(temps, 3)))