# temp driver
# TODO: create driver -> controller.py

# NOTE: the import paths below are assumptions; adjust them to this repo's layout.
from logger import Logging
from model import Model
from processor import Processor
from logreg import LogReg

CSV_FILE = '~/store/fraud_data/creditcard.csv'
YCOL = 'Class'

logger = Logging()
m = Model()
p = {'c': .3, 'iters': 100}  # LogReg hyperparameters: regularization strength and iteration count
proc = Processor()
lin = LogReg(p)
print(lin.get_parameters())

# TODO: make this a test suite
# Load the credit-card fraud data, normalize Amount, and drop the Time column.
data = proc.load_csv(CSV_FILE)
data = proc.normalize_col(data, 'Amount')
data = data.drop(['Time'], axis=1)
X = proc.get_xvals(data, YCOL)
y = proc.get_yvals(data, YCOL)
#print(data.describe())

# Build an under-sampled set, then 70/30 train/test splits for both versions.
Xu, yu = proc.under_sample(data, YCOL)
Xu_train, Xu_test, yu_train, yu_test = proc.cross_validation_sets(
    Xu, yu, .3, 0)
X_train, X_test, y_train, y_test = proc.cross_validation_sets(X, y, .3, 0)

# Try this with the under-sampled data.
c = lin.printing_Kfold_scores(Xu_train, yu_train)
lin.logistic_regression(Xu_train, Xu_test, yu_train, yu_test, c)
lin.logistic_regression(Xu_train, X_test, yu_train, y_test, c)
lin.get_roc_curve(Xu_train, Xu_test, yu_train, yu_test, c)

# Try this with the regular (imbalanced) data.
c = lin.printing_Kfold_scores(X_train, y_train)
lin.logistic_regression(X_train, X_test, y_train, y_test, c)
lin.get_roc_curve(X_train, X_test, y_train, y_test, c)
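# For reference, a minimal sketch of the under-sampling step the drivers rely on.
# This is an assumption about what Processor.under_sample does (balance the
# classes by sampling the majority class down to the minority count); the real
# method may differ. The helper name under_sample_sketch is hypothetical.
import pandas as pd

def under_sample_sketch(data, ycol, seed=0):
    """Return balanced X, y by down-sampling the majority (non-fraud) class."""
    minority = data[data[ycol] == 1]
    majority = data[data[ycol] == 0].sample(n=len(minority), random_state=seed)
    balanced = pd.concat([minority, majority]).sample(frac=1, random_state=seed)
    return balanced.drop(ycol, axis=1), balanced[ycol]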
# driver -> controller.py: argument-driven version of the ML suite.

import argparse

# NOTE: the import paths below are assumptions; adjust them to this repo's layout.
from logger import Logging
from model import Model
from processor import Processor
from logreg import LogReg

# Arguments for running the ML suite. The parser definition was missing here;
# this minimal version is reconstructed from how args is used below, with
# defaults taken from the constants the temp driver hard-coded.
parser = argparse.ArgumentParser(description='arguments for running ml suite')
parser.add_argument('--CSV_FILE', default='~/store/fraud_data/creditcard.csv')
parser.add_argument('--YCOL', default='Class')
parser.add_argument('--LR_DRIVE', action='store_true',
                    help='run the logistic regression driver')
args = parser.parse_args()

logger = Logging()
m = Model()
proc = Processor()

# Processor: load, normalize Amount, drop Time, split features from labels.
data = proc.load_csv(args.CSV_FILE)
data = proc.normalize_col(data, 'Amount')
data = data.drop(['Time'], axis=1)
print(data[args.YCOL].value_counts())
X = proc.get_xvals(data, args.YCOL)
y = proc.get_yvals(data, args.YCOL)

# Processor folds: under-sample, then build 70/30 train/test splits.
Xu, yu = proc.under_sample(data, args.YCOL)
Xu_train, Xu_test, yu_train, yu_test = proc.cross_validation_sets(
    Xu, yu, .3, 0)
X_train, X_test, y_train, y_test = proc.cross_validation_sets(X, y, .3, 0)

if args.LR_DRIVE:
    lin = LogReg()
    # Under-sampled data.
    c = lin.printing_Kfold_scores(Xu_train, yu_train)
    lin.logistic_regression(Xu_train, Xu_test, yu_train, yu_test, c)
    lin.logistic_regression(Xu_train, X_test, yu_train, y_test, c)
    lin.get_roc_curve(Xu_train, Xu_test, yu_train, yu_test, c)
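# Example invocation of the controller above (the file name controller.py is an
# assumption; the flag names come from the parser reconstructed above):
#
#   python controller.py --CSV_FILE ~/store/fraud_data/creditcard.csv \
#       --YCOL Class --LR_DRIVE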
# TensorFlow linear-model driver for the fraud data.

import tensorflow as tf

# NOTE: the import paths below are assumptions; adjust them to this repo's layout.
from logger import Logging
from processor import Processor


def main(_):
    # Import data
    CSV_FILE = '~/store/fraud_data/creditcard.csv'
    YCOL = 'Class'
    logger = Logging()
    proc = Processor()

    # TODO: make this a test suite
    data = proc.load_csv(CSV_FILE)
    data = proc.normalize_col(data, 'Amount')
    data = data.drop(['Time'], axis=1)
    X = proc.get_xvals(data, YCOL)
    y = proc.get_yvals(data, YCOL)
    #print(data.describe())
    Xu, yu = proc.under_sample(data, YCOL)
    Xu_train, Xu_test, yu_train, yu_test = proc.cross_validation_sets(
        Xu, yu, .3, 0)
    X_train, X_test, y_train, y_test = proc.cross_validation_sets(X, y, .3, 0)

    # Linear model: 29 input features (V1-V28 plus normalized Amount) -> 1 output.
    x = tf.placeholder(tf.float32, [None, 29])
    W = tf.Variable(tf.zeros([29, 1]))
    b = tf.Variable(tf.zeros([1]))
    y_pred = tf.matmul(x, W) + b  # renamed from y to avoid shadowing the labels above

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 1])

    # The raw formulation of cross-entropy,
    #
    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y_pred)),
    #                                 reduction_indices=[1]))
    #
    # can be numerically unstable, so tf.nn.softmax_cross_entropy_with_logits on
    # the raw logits is normally preferred:
    #
    #   cross_entropy = tf.reduce_mean(
    #       tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_pred))
    #
    # Here a plain squared-error loss is used instead.
    loss = tf.reduce_sum(tf.square(tf.subtract(y_, y_pred)))
    train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    # Train: full-batch gradient descent; labels reshaped to match [None, 1].
    y_train = y_train.values.reshape(-1, 1)
    y_test = y_test.values.reshape(-1, 1)
    for i in range(20):
        sess.run(train_step, feed_dict={x: X_train, y_: y_train})

    # Test trained model
    print("[model] training is complete ***************** ")
    # The model has a single output unit trained against 0/1 labels, so threshold
    # it at 0.5; an argmax over one column is always 0 and never measures accuracy.
    predicted = tf.cast(tf.greater(y_pred, 0.5), tf.float32)
    correct_prediction = tf.equal(predicted, y_)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print('accuracy: %s' % sess.run(accuracy, feed_dict={
        x: X_test.head(10000),
        y_: y_test[:10000]
    }))
    sess.close()


if __name__ == '__main__':
    tf.app.run(main=main)
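# A hedged alternative to the squared-error loss above: the model emits a single
# logit for a binary label, so the numerically stable formulation is sigmoid
# (not softmax) cross-entropy. A minimal self-contained sketch; the tensor names
# logits/labels are illustrative and not part of the driver.
import tensorflow as tf

logits = tf.placeholder(tf.float32, [None, 1])  # raw model output, e.g. x @ W + b
labels = tf.placeholder(tf.float32, [None, 1])  # 0/1 fraud labels
stable_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))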