# Пример #1 (Example #1) — paste-site header residue, kept as a comment
# 0
def command_line_run(args):
    args_dict = {}
    for i in range(1, len(args)):
        if '-' in args[i]:
            args_dict[args[i]] = []
            args_dict[-1] = args_dict[args[i]]
        else:
            args_dict[-1].append(float(args[i]))
    del args_dict[-1]

    num_classes = 10
    random.seed(1917)

    if '-debug' in args_dict:
        train_outputs = import_csv(TRAIN_OUTPUTS_SUBSET_PATH).astype(int)
        train_inputs = import_csv(TRAIN_INPUTS_SUBSET_PATH)
    else:
        train_outputs = import_csv(TRAIN_OUTPUTS_PATH).astype(int)
        train_inputs = import_csv(TRAIN_INPUTS_PATH)

    if '-t' in args_dict:
        print len(train_inputs)
        train_inputs = np.array(transform_features(train_inputs))
        print len(train_inputs)

    # Default values.
    hnh = []
    num_features = 300
    dropout = None
    lr = 1.0
    epochs = 50

    if '-f' in args_dict:
        num_features = map(int, args_dict['-f'])[0]

    if '-test' in args_dict:
        test_inputs = import_csv(TEST_INPUTS_PATH)

        if '-t' in args_dict:
            test_inputs = transform_features(test_inputs)

        if not num_features == len(train_inputs[0]):
            alll = feature_reduce(
                np.array(list(train_inputs) + list(test_inputs)), num_features)
            train_inputs = alll[:len(train_inputs)]
            test_inputs = alll[len(train_inputs):]

    if '-validate' in args_dict:
        validation_size = (4 * len(train_inputs)) / 5

        # Randomize the train and validation set.
        rand_idxs = random.sample(range(0, len(train_inputs)),
                                  len(train_inputs))

        test_inputs = train_inputs[rand_idxs[validation_size:]]
        test_outputs = train_outputs[rand_idxs[validation_size:]]
        train_inputs = train_inputs[rand_idxs[0:validation_size]]
        train_outputs = train_outputs[rand_idxs[0:validation_size]]

        # We have to reduce the features all at the same time because it is unsupervised learning and
        # we want the same features to be picked by PCA for both of the train and test sets.
        if not num_features == len(train_inputs[0]):
            alll = feature_reduce(
                np.array(list(train_inputs) + list(test_inputs)), num_features)
            train_inputs = alll[:len(train_inputs)]
            test_inputs = alll[len(train_inputs):]

    if '-hn' in args_dict:
        hnh = map(int, args_dict['-hn'])

    if '-d' in args_dict:
        if not (0.0 <= args_dict['-d'][0] <= 1.0):
            print 'Please input a dropout rate between 0 and 1!'
            exit(0)
        dropout = args_dict['-d'][0]

    if '-lr' in args_dict:
        lr = args_dict['-lr'][0]

    if '-e' in args_dict:
        epochs = int(args_dict['-e'][0])

    nn = NeuralNetwork(len(train_inputs[0]),
                       hnh,
                       num_classes,
                       learning_rate=lr,
                       dropout=dropout)
    nn.fit(train_inputs, train_outputs, training_horizon=epochs, verbose=True)
    p = nn.predict(test_inputs)

    fname = data_files_path + 'predictions_with_%depochs_%dfeatures_%0.2flf' % (
        epochs, num_features, lr)
    if '-test' in args_dict:
        with open(fname + '.csv', 'w') as f:
            f.write('Id,Prediction\n')
            for i in range(len(p)):
                f.write('%d,%d\n' % (i + 1, p[i]))
    else:
        print accuracy(p, test_outputs)
        if '-record' in args_dict:
            heatmap(p, test_outputs, fname)
def command_line_run(args):
    """Duplicate of the command_line_run defined earlier in this file.

    NOTE(review): this later definition shadows the earlier one at import
    time. Worse, its body continues past the prediction/output step into
    pasted grid-search reporting fragments (see the note below) that
    reference names never defined here, so the function cannot run to
    completion.
    """
    # Parse argv tokens into {flag: [float values]}. A temporary alias under
    # key -1 points at the current flag's value list.
    args_dict = {}
    for i in range(1,len(args)):
        if '-' in args[i]:
            args_dict[args[i]] = []
            args_dict[-1] = args_dict[args[i]]  # alias to the newest flag's list
        else:
            args_dict[-1].append(float(args[i]))  # KeyError if no flag seen yet
    del args_dict[-1]  # drop the alias; KeyError if argv contained no flags

    num_classes = 10
    random.seed(1917)  # fixed seed so the validation split is reproducible

    # -debug switches to the small subset CSVs for faster iteration.
    if '-debug' in args_dict:
        train_outputs = import_csv(TRAIN_OUTPUTS_SUBSET_PATH).astype(int)
        train_inputs = import_csv(TRAIN_INPUTS_SUBSET_PATH)
    else:
        train_outputs = import_csv(TRAIN_OUTPUTS_PATH).astype(int)
        train_inputs = import_csv(TRAIN_INPUTS_PATH)

    if '-t' in args_dict:
        print len(train_inputs)
        train_inputs = np.array(transform_features(train_inputs))
        print len(train_inputs)

    # Default values.
    hnh = []             # hidden-layer sizes
    num_features = 300
    dropout = None
    lr = 1.0
    epochs = 50

    if '-f' in args_dict:
        num_features = map(int, args_dict['-f'])[0]

    if '-test' in args_dict:
        test_inputs = import_csv(TEST_INPUTS_PATH)

        if '-t' in args_dict:
            test_inputs = transform_features(test_inputs)

        # Reduce train+test together so PCA picks one shared projection.
        if not num_features == len(train_inputs[0]):
            alll = feature_reduce(np.array(list(train_inputs)+list(test_inputs)), num_features)
            train_inputs = alll[: len(train_inputs)]
            test_inputs = alll[len(train_inputs) :]

    if '-validate' in args_dict:
        validation_size = (4 * len(train_inputs)) / 5  # 80/20 split

        # Randomize the train and validation set.
        rand_idxs = random.sample(range(0, len(train_inputs)), len(train_inputs))

        test_inputs = train_inputs[rand_idxs[validation_size : ]]
        test_outputs = train_outputs[rand_idxs[validation_size : ]]
        train_inputs = train_inputs[rand_idxs[0 : validation_size]]
        train_outputs = train_outputs[rand_idxs[0 : validation_size]]

        # We have to reduce the features all at the same time because it is unsupervised learning and
        # we want the same features to be picked by PCA for both of the train and test sets.
        if not num_features == len(train_inputs[0]):
            alll = feature_reduce(np.array(list(train_inputs)+list(test_inputs)), num_features)
            train_inputs = alll[: len(train_inputs)]
            test_inputs = alll[len(train_inputs) :]

    if '-hn' in args_dict:
        hnh = map(int, args_dict['-hn'])

    if '-d' in args_dict:
        if not (0.0 <= args_dict['-d'][0] <= 1.0):
            print 'Please input a dropout rate between 0 and 1!'
            exit(0)
        dropout = args_dict['-d'][0]

    if '-lr' in args_dict:
        lr = args_dict['-lr'][0]

    if '-e' in args_dict:
        epochs = int(args_dict['-e'][0])

    # NOTE(review): test_inputs is undefined (NameError below) when neither
    # -test nor -validate was passed.
    nn = NeuralNetwork(len(train_inputs[0]), hnh, num_classes, learning_rate=lr, dropout=dropout)
    nn.fit(train_inputs, train_outputs, training_horizon=epochs, verbose=True)
    p = nn.predict(test_inputs)

    fname = data_files_path+'predictions_with_%depochs_%dfeatures_%0.2flf'%(epochs,num_features,lr)
    if '-test' in args_dict:
        # Kaggle-style submission file: 1-based Id, predicted class.
        with open(fname+'.csv','w') as f:
            f.write('Id,Prediction\n')
            for i in range(len(p)):
                f.write('%d,%d\n'%(i+1,p[i]))
    else:
        print accuracy(p, test_outputs)
        if '-record' in args_dict:
            heatmap(p, test_outputs, fname)
    # NOTE(review): everything from here down appears to be fragments pasted
    # from separate grid-search scripts (a LinSVM one and two LogReg ones).
    # The names clf, test_x, test_y, train_x and train_y are never defined in
    # this function, so execution reaching this point raises NameError.
    # Presumably clf was a fitted sklearn GridSearchCV — verify against the
    # original scripts before salvaging any of this.
    # grid score
    print ("Grid scores on development set:")
    print ()
    for params, mean_score, scores in clf.grid_scores_:
        print ("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
    print ()

    # validation
    print ("Performance of optimal learner on validation set")
    print ()
    expected, predicted = test_y, clf.predict(test_x)
    print (classification_report(expected, predicted))
    print (confusion_matrix(expected, predicted))
    print ()
    accuracy(predicted, expected)
    heatmap(predicted, expected, "LinSVM/testAccuracy")

    # training accuracy
    print ("Performance of optimal learner on training set")
    print ()
    expected, predicted = train_y, clf.predict(train_x)
    print (classification_report(expected, predicted))
    print (confusion_matrix(expected, predicted))
    print ()
    accuracy(predicted, expected)
    heatmap(predicted, expected, "LinSVM/trainAccuracy")

    # training on the whole dataset
    print ("fitting the best estimator on the complete training set")
    learner = clf.best_estimator_
    learner.fit(train_inputs, train_outputs)
    # NOTE(review): second pasted fragment (LogReg variant) — repeats the
    # same grid-score / validation / training report as above.
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" %
              (mean_score, scores.std() * 2, params))
    print()

    #validation
    print("Performance of optimal learner on validation set")
    print()
    expected, predicted = test_y, clf.predict(test_x)
    print(classification_report(expected, predicted))
    print(confusion_matrix(expected, predicted))
    print()
    accuracy(predicted, expected)
    heatmap(predicted, expected, 'LogReg/testAccuracy')

    #training accuracy
    print("Performance of optimal learner on training set")
    print()
    expected, predicted = train_y, clf.predict(train_x)
    print(classification_report(expected, predicted))
    print(confusion_matrix(expected, predicted))
    print()
    accuracy(predicted, expected)
    heatmap(predicted, expected, 'LogReg/trainAccuracy')

    #training on the whole dataset
    print("fitting the best estimator on the complete training set")
    learner = clf.best_estimator_
    learner.fit(train_inputs, train_outputs)
    # NOTE(review): third pasted fragment — a near-byte-identical copy of the
    # LogReg block directly above.
    #grid score
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
    print()

    #validation
    print("Performance of optimal learner on validation set")
    print()
    expected, predicted = test_y, clf.predict(test_x)
    print(classification_report(expected, predicted))
    print(confusion_matrix(expected, predicted))
    print()
    accuracy(predicted,expected)
    heatmap(predicted,expected,'LogReg/testAccuracy')

    #training accuracy
    print("Performance of optimal learner on training set")
    print()
    expected, predicted = train_y, clf.predict(train_x)
    print(classification_report(expected, predicted))
    print(confusion_matrix(expected, predicted))
    print()
    accuracy(predicted,expected)
    heatmap(predicted,expected,'LogReg/trainAccuracy')

    #training on the whole dataset
    print("fitting the best estimator on the complete training set")
    learner=clf.best_estimator_
    learner.fit(train_inputs,train_outputs)