Example #1
    #TODO: create driver (controller.py)
    CSV_FILE = '~/store/fraud_data/creditcard.csv'
    YCOL = 'Class'
    logger = Logging()
    m = Model()
    p = {'c': .3, 'iters': 100}
    proc = Processor()
    lin = LogReg(p)
    print(lin.get_parameters())

    #TODO: make this a test suite
    data = proc.load_csv(CSV_FILE)
    data = proc.normalize_col(data, 'Amount')
    data = data.drop(['Time'], axis=1)
    X = proc.get_xvals(data, YCOL)
    y = proc.get_yvals(data, YCOL)
    #print data.describe()
    Xu, yu = proc.under_sample(data, YCOL)
    Xu_train, Xu_test, yu_train, yu_test = proc.cross_validation_sets(
        Xu, yu, .3, 0)
    X_train, X_test, y_train, y_test = proc.cross_validation_sets(X, y, .3, 0)
    #try this with under sampled data
    c = lin.printing_Kfold_scores(Xu_train, yu_train)
    lin.logistic_regression(Xu_train, Xu_test, yu_train, yu_test, c)
    lin.logistic_regression(Xu_train, X_test, yu_train, y_test, c)
    lin.get_roc_curve(Xu_train, Xu_test, yu_train, yu_test, c)
    #try this with regular data
    c = lin.printing_Kfold_scores(X_train, y_train)
    lin.logistic_regression(X_train, X_test, y_train, y_test, c)
    lin.get_roc_curve(X_train, X_test, y_train, y_test, c)
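
The `Processor` helpers this example relies on are not shown. Below is a minimal sketch of what they plausibly do, built on pandas and scikit-learn; the class and method names come from the example, but every body is an assumption, not the original implementation:

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    class Processor(object):
        def load_csv(self, path):
            # Read the dataset from disk into a DataFrame.
            return pd.read_csv(path)

        def normalize_col(self, data, col):
            # Standardize one column to zero mean and unit variance.
            data[col] = StandardScaler().fit_transform(data[[col]]).ravel()
            return data

        def get_xvals(self, data, ycol):
            # Feature matrix: everything except the label column.
            return data.drop([ycol], axis=1)

        def get_yvals(self, data, ycol):
            # Label column as a single-column DataFrame.
            return data[[ycol]]

        def under_sample(self, data, ycol):
            # Balance the classes by sampling the majority class down to the
            # size of the minority class (fraud is the rare class here).
            minority = data[data[ycol] == 1]
            majority = data[data[ycol] == 0].sample(len(minority), random_state=0)
            balanced = pd.concat([minority, majority])
            return self.get_xvals(balanced, ycol), self.get_yvals(balanced, ycol)

        def cross_validation_sets(self, X, y, test_size, seed):
            # Thin wrapper over scikit-learn's train/test split.
            return train_test_split(X, y, test_size=test_size, random_state=seed)
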
Example #2
    #arguments for running ml suite

    #driver - controller.py
    #CSV_FILE = '~/store/fraud_data/creditcard.csv'
    #YCOL = 'Class'
    logger = Logging()
    m = Model()
    proc = Processor()

    #processor
    data = proc.load_csv(args.CSV_FILE)
    data = proc.normalize_col(data, 'Amount')
    data = data.drop(['Time'], axis=1)
    print(data[args.YCOL].value_counts())
    X = proc.get_xvals(data, args.YCOL)
    y = proc.get_yvals(data, args.YCOL)

    #processor xfolds
    Xu, yu = proc.under_sample(data, args.YCOL)
    Xu_train, Xu_test, yu_train, yu_test = proc.cross_validation_sets(
        Xu, yu, .3, 0)
    X_train, X_test, y_train, y_test = proc.cross_validation_sets(X, y, .3, 0)

    if args.LR_DRIVE:
        lin = LogReg()
        #under sampled data
        c = lin.printing_Kfold_scores(Xu_train, yu_train)
        lin.logistic_regression(Xu_train, Xu_test, yu_train, yu_test, c)
        lin.logistic_regression(Xu_train, X_test, yu_train, y_test, c)
        lin.get_roc_curve(Xu_train, Xu_test, yu_train, yu_test, c)
        #regular data
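
Example #2 is truncated in the source, and it reads its settings from an `args` object it never defines. A plausible argparse setup for the three attributes it uses is sketched below; only the attribute names (`CSV_FILE`, `YCOL`, `LR_DRIVE`) come from the example, and the defaults are taken from the constants commented out at its top — the flag names are assumptions:

    import argparse

    parser = argparse.ArgumentParser(description='arguments for running ml suite')
    parser.add_argument('--csv-file', dest='CSV_FILE',
                        default='~/store/fraud_data/creditcard.csv',
                        help='path to the input CSV')
    parser.add_argument('--ycol', dest='YCOL', default='Class',
                        help='name of the label column')
    parser.add_argument('--lr-drive', dest='LR_DRIVE', action='store_true',
                        help='run the logistic regression driver')
    args = parser.parse_args()
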
Example #3
def main(_):
    # Import data
    CSV_FILE = '~/store/fraud_data/creditcard.csv'
    YCOL = 'Class'
    logger = Logging()
    proc = Processor()

    #TODO: make this a test suite
    data = proc.load_csv(CSV_FILE)
    data = proc.normalize_col(data, 'Amount')
    data = data.drop(['Time'], axis=1)
    X = proc.get_xvals(data, YCOL)
    y = proc.get_yvals(data, YCOL)
    #print data.describe()
    Xu, yu = proc.under_sample(data, YCOL)
    Xu_train, Xu_test, yu_train, yu_test = proc.cross_validation_sets(
        Xu, yu, .3, 0)
    X_train, X_test, y_train, y_test = proc.cross_validation_sets(X, y, .3, 0)
    # Build a single-unit linear model over the 29 remaining feature columns.
    # Note that this reassignment shadows the pandas label vector `y` loaded
    # above; the train/test splits were already taken, so only the tensor
    # version is used from here on.
    x = tf.placeholder(tf.float32, [None, 29])
    W = tf.Variable(tf.zeros([29, 1]))
    b = tf.Variable(tf.zeros([1]))
    y = tf.matmul(x, W) + b

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 1])

    # The raw formulation of cross-entropy,
    #
    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
    #                                 reduction_indices=[1]))
    #
    # can be numerically unstable.
    #
    # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
    # outputs of 'y', and then average across the batch.
    #cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

    #cross_entropy = -tf.reduce_sum(y_*tf.log(tf.clip_by_value(y,1e-10,1.0)))
    # Note: despite the variable name, this is a sum-of-squares loss, not
    # cross-entropy.
    cross_entropy = tf.reduce_sum(tf.square(tf.subtract(y_, y)))
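    # A numerically stable cross-entropy for this single-logit binary case
    # (an assumed alternative, not part of the original) would be:
    #cross_entropy = tf.reduce_mean(
    #    tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=y))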
    train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy)

    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()
    # Train
    # as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
    y_test = y_test.to_numpy()
    for i in range(20):
        #batch_xs, batch_ys = mnist.train.next_batch(100)
        #batch_xs = X_train
        #batch_ys = y_train.as_matrix()
        sess.run(train_step, feed_dict={x: X_train, y_: y_train.to_numpy()})
        # Evaluate the partially trained model after each step.
        print("[model] finished training step %d" % i)
        # argmax over a single-logit output is always 0, so threshold the
        # output at 0.5 and compare it against the 0/1 labels instead.
        correct_prediction = tf.equal(
            tf.cast(tf.greater(y, 0.5), tf.float32), y_)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        print('accuracy: %s' % sess.run(accuracy,
                                        feed_dict={
                                            x: X_test.head(10000),
                                            y_: y_test[:10000]
                                        }))
    # Manual accuracy check, left commented out in the original:
    #cp = sess.run(tf.cast(correct_prediction, tf.float32),
    #              feed_dict={x: X_test.head(10000), y_: y_test[:10000]})
    #lacc = tf.subtract(tf.cast(correct_prediction, tf.float32), y_test[:10000])
    #cp = sess.run(lacc, feed_dict={x: X_test.head(10000), y_: y_test[:10000]})
    #count = 0
    #for idx, c in enumerate(cp):
    #    if c != y_test[idx]:
    #        #print(idx, c, y_test[idx])
    #        continue
    #    else:
    #        count += 1
    #print(count / float(10000))
    sess.close()
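
Example #3 targets the TensorFlow 1.x graph API (tf.placeholder, tf.InteractiveSession). Under TensorFlow 2.x it only runs through the compatibility layer; a minimal shim (an assumption about the runtime environment, not part of the original) is:

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()  # restore graph mode so placeholders and sessions work
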