def version2():  # Data cleaning in NLP model
    stemmer = ps()
    stop_words = set(sw.words('english'))
    corpus = []

    for i in range(len(df)):
        # Keep only letters, lowercase, drop stopwords, and stem each review
        review = re.sub('[^a-zA-Z]', ' ', df.iloc[i, 1])
        review = review.lower().split()
        review = [stemmer.stem(word) for word in review if word not in stop_words]
        corpus.append(" ".join(review))

    labels = df.iloc[:, -1]

    # random_state=100 is assumed here for the original positional argument 100
    features_train, features_test, labels_train, labels_test = train_test_split(
        corpus, labels, random_state=100)

    # Fit a single vectorizer on the training texts and reuse it for the test split
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)

    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    print(ras(labels_test, predictions))  # ROC AUC of the predictions
    print(cm(labels_test, predictions))   # Confusion matrix

    return model
def version1():  # Baseline logistic regression model
    features_train, features_test, labels_train, labels_test = train_test_split(
        df["reviewText"], df["Positivity"], random_state=100)

    # Fit a single CountVectorizer on the training texts and reuse it for the test split
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)

    model = lr().fit(features_train_vectorized,
                     labels_train)  # Logistic regression model
    predictions = model.predict(features_test_vectorized)

    print(ras(labels_test, predictions))  # ROC AUC of the predictions
    print(cm(labels_test, predictions))   # Confusion matrix

    return model
def version3():  # TF-IDF model
    global vect

    features_train, features_test, labels_train, labels_test = train_test_split(
        df["reviewText"], df["Positivity"], random_state=100)

    vect = TfidfVectorizer(min_df=5)
    features_train_vectorized = vect.fit_transform(features_train)
    features_test_vectorized = vect.transform(features_test)

    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    print(ras(labels_test, predictions))
    print(cm(labels_test, predictions))

    return model
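
These three functions rely on short aliases and a preloaded DataFrame that the snippet does not show. A minimal sketch of the setup they appear to assume (the alias choices and the name `df` are inferred from the calls above, not confirmed by the source):

import re
import pandas as pd
from nltk.corpus import stopwords as sw
from nltk.stem.porter import PorterStemmer as ps
from sklearn.feature_extraction.text import CountVectorizer as cv, TfidfVectorizer
from sklearn.linear_model import LogisticRegression as lr
from sklearn.metrics import roc_auc_score as ras, confusion_matrix as cm
from sklearn.model_selection import train_test_split

df = pd.read_csv('reviews.csv')  # hypothetical file with 'reviewText' and 'Positivity' columns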
Example #4
def roc_scores(data):
    # Compute the ROC AUC of every prediction column against the 'true' column
    y = data
    rocs = pandas.Series(dtype=float)
    for c in y.columns.values[1:]:
        # Series.set_value() was removed from pandas; plain item assignment does the same job
        rocs[c] = ras(y['true'], y[c])
    return rocs
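
A small usage sketch for roc_scores, assuming the frame's first column 'true' holds the binary labels and every remaining column holds one model's scores (the column names here are invented):

scores_df = pandas.DataFrame({
    'true':    [0, 1, 1, 0, 1],            # ground-truth labels in the first column
    'model_a': [0.2, 0.8, 0.6, 0.3, 0.9],  # scores from one model
    'model_b': [0.4, 0.7, 0.5, 0.6, 0.8],  # scores from another model
})
print(roc_scores(scores_df))  # Series with one ROC AUC per model column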
Example #5
def dfauc(x):
    # Blend the predictions in the list `df` with weights x[0..n-2];
    # the last model receives the leftover weight (1 minus the sum of the others).
    weight_used = 0
    for i in range(len(df)):
        if i == 0:
            average = np.multiply(x[i], df[i].target.values)
            weight_used = weight_used + x[i]
        elif i < (len(df) - 1):
            average = average + np.multiply(x[i], df[i].target.values)
            weight_used = weight_used + x[i]
        else:
            average = average + np.multiply(1 - weight_used, df[i].target.values)
    # Negate the AUC so that a minimizer maximizes the blended score
    auc_score = ras(df[i].ref.values, average)
    return -1 * auc_score
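
Because dfauc returns the negative AUC, it looks intended as an objective for a generic minimizer. A minimal sketch of that use, assuming `df` is a global list of prediction frames with `target` (model scores) and `ref` (true labels) columns; the choice of scipy.optimize.minimize and the starting point are mine, not from the source:

from scipy.optimize import minimize

n_models = len(df)
x0 = np.full(n_models - 1, 1.0 / n_models)           # near-uniform starting weights for all but the last model
result = minimize(dfauc, x0, method='Nelder-Mead')   # minimizing -AUC maximizes the blended AUC
print('best weights:', result.x, 'best AUC:', -result.fun)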
Example #6
]
totalScores = np.zeros(7)
for subject in subjects:
    scores = np.zeros(7)
    print('calculating scores for subject: ' + str(subject))
    for serie in series:
        data = pd.read_csv(
            'SVM_results_binary_allCSP/subj%d_series%d_results.csv' %
            (subject, serie))
        data = np.array(data[data.columns[1:]])
        truth = pd.read_csv('input/train/subj%d_series%d_events.csv' %
                            (subject, serie))
        truth = np.array(truth[cols])

        for i in range(0, 6):
            scores[i] += ras(truth[:, i], data[:, i])

        scores[6] += ras(truth, data, average='macro')

    scores = np.true_divide(scores, len(series))
    totalScores += scores

    print('Writing scores for subject: ' + str(subject))
    f = open('SVM_scores/subj%d_mean_scores.txt' % (subject), 'w')
    f.write('Average AUC score: {}\n'.format(scores[6]))
    f.write('Scores by Event:\n')
    f.write('{0:>20} {1:>20} {2:>20} {3:>20} {4:>20} {5:>20}'.format(
        cols[0], cols[1], cols[2], cols[3], cols[4], cols[5]))
    f.write('\n')
    f.write('{0:20.5} {1:20.5} {2:20.5} {3:20.5} {4:20.5} {5:20.5}'.format(
        scores[0], scores[1], scores[2], scores[3], scores[4], scores[5]))
Example #7
def auc_roc(y_orig, x_orig, w):
    y_predicted = np.dot(x_orig, w.reshape(w.size, 1))
    return ras(y_orig, y_predicted)
Example #8

                    sess.run(cross_entropy,
                             feed_dict={
                                 prob: 1.0,
                                 x: xtr[i - 100:i],
                                 y_: ytr[i - 100:i]
                             }))

        feed_dict = {x: xte, y_: yte, prob: 1.0}
        ac = sess.run(acc, feed_dict=feed_dict)
        print("Acc:", ac)
        '''print('Test Acc: %g' % acc.eval())
        print('Train Acc: %g' % acc.eval(feed_dict={
                        x: xtr,
                        y_: ytr,
                        prob: 1.0}))'''

        #    y_true = sess.run(tf.argmax(y_,1), feed_dict={y_:yte, prob:1.0})
        #    y_score = sess.run(tf.argmax(y_conv,1), feed_dict={x:xte, prob:1.0})
        a = sess.run(y_, feed_dict={y_: yte, prob: 1.0})
        b = sess.run(y_conv, feed_dict={x: xte, prob: 1.0})
        rass = ras(a, b)
        print('AUC(ROC):', rass)
        plot.append([ac, rass])  # store the accuracy and the AUC score, not the ras function itself
#        ka.append(sess.run(acc, feed_dict={x:xtr, y_:ytr, prob:1.0}))
#        kb.append(sess.run(acc, feed_dict={x:xte, y_:yte, prob:1.0}))
#        kc.append(rass)
    writer.add_graph(sess.graph)
    saver.save(sess, 'Model1/cnn')

print('Done.')
Example #9
import pandas as pd
import sys
import numpy as np
from sklearn.metrics import roc_auc_score as ras

results_file = sys.argv[1]
truth_file = sys.argv[2]
out = sys.argv[3]

results = pd.read_csv(results_file)
truth = pd.read_csv(truth_file)

cols = np.array(truth.columns[1:])

scores = np.empty(6)

for i in range(0, 6):
    scores[i] = ras(np.array(truth[cols[i]]), np.array(results[cols[i]]))

avg_score = ras(truth[cols], results[cols], average='macro')

f = open(out, 'w')
f.write('Average AUC score: ' + str(avg_score) + '\n')
f.write('Scores by Event:\n')
f.write('{0:>20} {1:>20} {2:>20} {3:>20} {4:>20} {5:>20}'.format(
    cols[0], cols[1], cols[2], cols[3], cols[4], cols[5]))
f.write('\n')
f.write('{0:20.5} {1:20.5} {2:20.5} {3:20.5} {4:20.5} {5:20.5}'.format(
    scores[0], scores[1], scores[2], scores[3], scores[4], scores[5]))
f.close()
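
Since the script takes its three paths from sys.argv, it would be run roughly as `python score_events.py results.csv truth.csv scores.txt` (the script and file names here are only placeholders).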
Example #10

# First run the file data_adjsutments.py to prepare the variables.

# Now let's optimize the model
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.metrics import roc_auc_score as ras

# First, tune the number of estimators
results = []

n_estimator_option = [100, 250, 300, 700]

for trees in n_estimator_option:
    model = rfr(n_estimators=trees, oob_score=True, random_state=42)
    model.fit(X_train, Y_train)
    print(trees, 'trees')
    roc = ras(Y_train, model.oob_prediction_)  # c-statistic from the out-of-bag predictions
    print('c-stat:', roc)
    results.append(roc)
    print("")

pd.Series(results, n_estimator_option).plot()

# n_estimators=300 gives the highest c-stat, so use it below
#%%

results = []

max_feature_option = ['auto', None, "sqrt", "log2", 0.9, 0.2]

for max_feature in max_feature_option:
    model = rfr(n_estimators=300,
Example #11
            sum1 = sum1 + y[i] * x1[i] * (
                1 - (1 / (1 + np.exp(-y[i] * (w1 * x1[i] + w2 * x2[i])))))
            sum2 = sum2 + y[i] * x2[i] * (
                1 - (1 / (1 + np.exp(-y[i] * (w1 * x1[i] + w2 * x2[i])))))

        w1 = w1 + k / 205 * sum1 - k * c * w1
        w2 = w2 + k / 205 * sum2 - k * c * w2
        itter += 1
    print(w1, w2, itter)

    return w1, w2


def sigma(w1, w2, x1i, x2i):
    # Probability of the positive class for a single sample (x1i, x2i)
    p = 1 / (1 + np.exp(-w1 * x1i - w2 * x2i))
    return p


pvect = []
w1, w2 = weights(0, 0, 0.1, 10)
print(w1, w2)

for i in range(205):
    pvect.append(sigma(w1, w2, x1[i], x2[i]))
#  print(sigma(w1, w2, x1[i], x2[i]))

y_true = np.array(y)
y_scores = np.array(pvect)
print(ras(y_true, y_scores))
#print(func(w1,w2,y,x1,x2))
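
Reading the truncated weights() loop above, it appears to implement gradient ascent for two-feature logistic regression with L2 regularization and labels y in {-1, +1}: each step updates w1 <- w1 + (k/205) * sum_i y_i * x1_i * (1 - sigma(y_i * (w1*x1_i + w2*x2_i))) - k*c*w1, and symmetrically for w2, where k is the learning rate, c the regularization strength, and 205 the number of samples; the fitted probabilities are then scored with roc_auc_score.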
Example #12
# Imports this snippet relies on (Keras assumed to come from tensorflow.keras;
# plt, roc_curve and ras are used in the plotting branch below)
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import roc_curve, roc_auc_score as ras


def create_and_predict(data, **kwargs):
    """
    kwargs: 
        neurons=32
        epochs=50
        learning_rate=0.01
        batch_size=32
        plot=False
    """
    #
    # 1) Initialize
    act = 'relu'
    architecture = [
        Dense(
            kwargs.get('neurons', 32),
            input_shape=(2, ),
            activation=act,
        ),
        Dense(
            kwargs.get('neurons', 32),
            activation=act,
        ),
        #Dense(
        #    kwargs.get('neurons',32),
        #    activation=act,),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(architecture)
    model.compile(
        optimizer=SGD(learning_rate=kwargs.get('learning_rate', .01)),
        loss='mean_squared_error',
        metrics=['accuracy'],
    )
    #
    # 2) Fit
    results = model.fit(
        *data['train'],
        batch_size=kwargs.get('batch_size', 32),
        epochs=kwargs.get('epochs', 50),
        verbose=1,
        callbacks=[EarlyStopping()],
        validation_data=data['val'],
    )
    #
    # 3) return results
    results = results.history
    results['ytrue_val'] = data['val'][1]
    results['ytrue_test'] = data['test'][1]
    results['ypred_val'] = model.predict(data['val'][0])
    results['ypred_test'] = model.predict(data['test'][0])
    results['specs'] = kwargs
    #
    if kwargs.get('plot', False):
        case = 'test'
        from sklearn.linear_model import LogisticRegression as lr
        f, ax = plt.subplots(1, 3, figsize=(20, 7))
        fpr, tpr, threshold = roc_curve(results['ytrue_' + case],
                                        results['ypred_' + case])
        ax[0].plot(
            tuple(fpr),
            tuple(tpr),
            label='NN AUC ' + str(
                round(ras(results['ytrue_' + case], results['ypred_' + case]),
                      2)))
        if False:
            # Logistic Regression
            newytrue, newypred = data[case][1], lr(max_iter=5000).fit(
                *data['train']).predict_proba(data[case][0])[:, 1]
            fpr2, tpr2, threshold = roc_curve(newytrue, newypred)
            ax[0].plot(tuple(fpr2),
                       tuple(tpr2),
                       label='Logistic AUC ' +
                       str(round(ras(newytrue, newypred), 2)))
        ax[0].set_title('ROC curve')
        ax[0].legend()
        weights = {0: [], 1: []}
        for i, x in enumerate(results['ypred_' + case]):
            weights[data[case][1][i][0]] += [x[0]]
Example #13
series = range(1, 9)
# Event columns scored individually; the seventh slot of `scores` holds the macro average over all events
cols = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff', 'Replace', 'BothReleased']
totalScores = np.zeros(7)
for subject in subjects:
    scores = np.zeros(7)
    print('calculating scores for subject: ' + str(subject))
    for serie in series:
        data = pd.read_csv('SVM_results_binary_allCSP/subj%d_series%d_results.csv' % (subject, serie))
        data = np.array(data[data.columns[1:]])
        truth = pd.read_csv('input/train/subj%d_series%d_events.csv' % (subject, serie))
        truth = np.array(truth[cols])

        for i in range(0, 6):
            scores[i] += ras(truth[:, i], data[:, i])

        scores[6] += ras(truth, data, average='macro')

    scores = np.true_divide(scores, len(series))
    totalScores += scores

    print('Writing scores for subject: ' + str(subject))
    f = open('SVM_scores/subj%d_mean_scores.txt' % (subject), 'w')
    f.write('Average AUC score: {}\n'.format(scores[6]))
    f.write('Scores by Event:\n')
    f.write('{0:>20} {1:>20} {2:>20} {3:>20} {4:>20} {5:>20}'.format(cols[0], cols[1], cols[2], cols[3], cols[4], cols[5]))
    f.write('\n')
    f.write('{0:20.5} {1:20.5} {2:20.5} {3:20.5} {4:20.5} {5:20.5}'.format(scores[0], scores[1], scores[2], scores[3], scores[4], scores[5]))
    f.close()

print('Calculating overall mean scores')
Example #14
# Calculates AUC scores for a given prediction file against the given truth file;
# the third command-line argument is the output file name.

import pandas as pd
import sys
import numpy as np
from sklearn.metrics import roc_auc_score as ras

results_file = sys.argv[1]
truth_file = sys.argv[2]
out = sys.argv[3]

results = pd.read_csv(results_file)
truth = pd.read_csv(truth_file)

cols = np.array(truth.columns[1:])

scores = np.empty(6)

for i in range(0, 6):
    scores[i] = ras(np.array(truth[cols[i]]), np.array(results[cols[i]]))

avg_score = ras(truth[cols], results[cols], average='macro')

f = open(out, 'w')
f.write('Average AUC score: ' + str(avg_score) + '\n')
f.write('Scores by Event:\n')
f.write('{0:>20} {1:>20} {2:>20} {3:>20} {4:>20} {5:>20}'.format(cols[0], cols[1], cols[2], cols[3], cols[4], cols[5]))
f.write('\n')
f.write('{0:20.5} {1:20.5} {2:20.5} {3:20.5} {4:20.5} {5:20.5}'.format(scores[0], scores[1], scores[2], scores[3], scores[4], scores[5]))
f.close()
Example #15
    i = 0
    for train_index, validation_index in kf.split(X_train_norm):
        # pandas removed the .ix indexer; KFold yields positional indices, so .iloc is used here
        X_tra, X_val = X_train_norm.iloc[train_index, :], X_train_norm.iloc[
            validation_index, :]
        y_tra, y_val = y_train.iloc[train_index], y_train.iloc[validation_index]
        clf = clf.fit(X_tra, y_tra['Gravedad'])
        prediction = clf.predict(X_val)
        Acc.iloc[i] = np.mean(np.array(y_val).T == prediction)
        feat_imp.iloc[:, i] = clf.feature_importances_
        i += 1
        i += 1
    Acc_final_rf = np.mean(Acc)
    feat_imp = np.mean(feat_imp, axis=1)
    lista1, ordered_feat = zip(*sorted(zip(feat_imp, features), reverse=True))
    final = clf.predict(X_test_norm)
    Acc_test[j] = np.mean(final == y_test)
    AUC[j] = ras(y_test, final, average='macro')
    del clf, kf, Acc

# Final results:
auc = AUC.copy()
Acc_test.mean()
Acc_test.std()

plt.figure()
plt.plot(range(1, 101), Acc_test)
plt.plot(range(1, 101), np.tile(Acc_test.mean(), len(Acc_test)), c='r')
plt.plot(range(1, 101),
         np.tile(Acc_test.mean() + Acc_test.std(), len(Acc_test)),
         c='r',
         ls='--')
plt.plot(range(1, 101),