Example No. 1
    def _validate_model(self, x: np.ndarray, y: np.ndarray, validation_file_name: str = "validation.json") -> dict:
        logging.info("Creating predictions ...")
        y_predicted_categories = self._model.predict(x, batch_size=self._batch_size)
        gc.collect()

        from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
        y_expected_1dim = self._label_enc.max_category(y)
        y_predicted_1dim = self._label_enc.max_category(y_predicted_categories)
        logging.info("Results:")
        logging.info("{}".format(precision_recall_fscore_support(y_true=y_expected_1dim, y_pred=y_predicted_1dim)))
        accuracy = accuracy_score(y_true=y_expected_1dim, y_pred=y_predicted_1dim)
        logging.info("{}".format(accuracy))

        logging.info("\n{}".format(classification_report(y_true=y_expected_1dim,
                                                         y_pred=y_predicted_1dim,
                                                         target_names=["neg", "pos"],
                                                         )))

        results = classification_report(y_true=y_expected_1dim,
                                        y_pred=y_predicted_1dim,
                                        target_names=["neg", "pos"],
                                        output_dict=True)
        results["accuracy"] = accuracy
        write_text_file(
            file_path=self._experiment_folder / validation_file_name,
            text=json.dumps(results))

        return results
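On current scikit-learn the report comes from the public sklearn.metrics module, and the output_dict=True form already carries an "accuracy" entry, so the manual assignment above is only needed on older releases. A minimal, self-contained sketch with toy labels (not the model outputs above):

import json
from sklearn.metrics import classification_report

# Toy labels purely for illustration.
y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]

results = classification_report(y_true, y_pred,
                                target_names=["neg", "pos"],
                                output_dict=True)
print(results["accuracy"])                 # present in the dict on recent scikit-learn
print(json.dumps(results, default=float))  # default= guards against numpy scalar types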
Example No. 2
def ensemble_test():
    file_src_dict = {'embedding_file': './data/word_embedding.pkl'}
    with open(file_src_dict['embedding_file'],
              'rb') as f:  # embedding is a 2d-list with size :vocab_size*dim
        embeddings = pickle.load(f)
    with open('./data/val.pkl',
              'rb') as f:  # validation data: query, response and label lists
        val_q, val_r, val_label = pickle.load(f)
    all_pred_score = []
    all_pred_label = []
    models = ['./base/model.1']
    model_name = ['base']
    graphs = [tf.Graph() for i in range(0, len(models))]
    for i in range(0, len(graphs)):
        with graphs[i].as_default():
            arnn = ARNN(embedding=embeddings)
            if model_name[i] == 'base':
                arnn.build_base_model(training=False)
            pred_label, pred_score = arnn.predict(model_path=models[i],
                                                  data_q=val_q,
                                                  data_r=val_r)
            all_pred_score.append(pred_score)
            all_pred_label.append(pred_label)
            del arnn
    # Average the softmax scores across all models.
    final_score = sum(all_pred_score) / len(all_pred_score)
    final_label = [int(s[1] > s[0]) for s in final_score]
    if val_label is not None:
        for pred_l in all_pred_label:
            print(classification_report(val_label, pred_l))
        print(classification_report(val_label, final_label))
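The ensemble step above just averages each model's two-column score matrix and takes the larger column per row; a minimal numpy sketch of that voting rule (toy scores, not real ARNN outputs):

import numpy as np

# Toy per-model score matrices, shape (n_samples, 2): column 0 = negative, column 1 = positive.
scores_a = np.array([[0.9, 0.1], [0.3, 0.7]])
scores_b = np.array([[0.6, 0.4], [0.2, 0.8]])

final_score = (scores_a + scores_b) / 2
final_label = [int(s[1] > s[0]) for s in final_score]
print(final_label)  # [0, 1]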
Example No. 3
def eval(model, cfg, mode='val', cuda=True):
    data_info = cfg.dataset[mode]
    data_reader = DataReader(
        ann_files=[data_info['ann_file']],
        img_dirs=[data_info['img_prefix']],
        transform=None,
        mode='val',
        img_scale=data_info['img_scale'],
        keep_ratio=data_info['keep_ratio'],
    )
    data_loader = DataLoader(data_reader,
                             collate_fn=collate_fn,
                             **cfg.val_data_loader)
    y_true, y_pred = [], []
    model.eval()
    for step, (data, target) in tqdm(enumerate(data_loader)):
        # inputs = torch.stack(data)
        # target = torch.from_numpy(np.array(target)).type(torch.LongTensor)
        inputs = data
        targets = target
        if cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        with torch.no_grad():
            outputs = model(inputs)
        outs = nn.functional.softmax(outputs, dim=1)
        pred = torch.argmax(outs, dim=1)
        y_true.extend(list(targets.cpu().detach().numpy()))
        y_pred.extend(list(pred.cpu().detach().numpy()))
    model.train()
    return classification_report(y_true, y_pred, output_dict=True), \
           classification_report(y_true, y_pred, output_dict=False)
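eval returns the report twice: output_dict=True yields a nested dict for programmatic access, output_dict=False the printable table. A self-contained illustration of the two forms (toy labels, no data loader):

from sklearn.metrics import classification_report

y_true = [0, 1, 2, 2, 1]
y_pred = [0, 2, 2, 2, 1]

as_dict = classification_report(y_true, y_pred, output_dict=True)
as_text = classification_report(y_true, y_pred)

print(as_text)                            # human-readable table
print(as_dict['macro avg']['f1-score'])   # pull a single scalar, e.g. for logging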
Example No. 4
    def evalulate_on_cache(self):
        grapheme_logits_all = np.vstack(self.grapheme_logits_cache)
        vowel_logits_all = np.vstack(self.vowel_logits_cache)
        consonant_logits_all = np.vstack(self.consonant_logits_cache)
        labels_all = np.vstack(self.labels_cache)

        grapheme_preds = np.argmax(grapheme_logits_all, axis=1)
        vowels_preds = np.argmax(vowel_logits_all, axis=1)
        consonant_preds = np.argmax(consonant_logits_all, axis=1)

        grapheme_clf_result = classification_report(labels_all[:, 0], grapheme_preds, output_dict=True)
        vowels_clf_result = classification_report(labels_all[:, 1], vowels_preds, output_dict=True)
        consonant_clf_result = classification_report(labels_all[:, 2], consonant_preds, output_dict=True)
        kaggle_score = (grapheme_clf_result['macro avg']['recall'] * 2 + vowels_clf_result['macro avg']['recall'] +
                        consonant_clf_result['macro avg']['recall']) / 4


        acc = np.mean(self.acc_cache)
        loss = np.mean(self.loss_cache)


        result = {
            'grapheme_clf_result': grapheme_clf_result,
            'vowels_clf_result': vowels_clf_result,
            'consonant_clf_result': consonant_clf_result,
            'kaggle_score': kaggle_score,
            #'preds_labels': preds_labels,
            'acc': acc,
            'loss': loss

        }
        return result
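kaggle_score above is a weighted average of the three components' macro recalls, with the grapheme head counted twice; the same quantity can be reproduced with sklearn.metrics.recall_score (toy labels, purely illustrative):

import numpy as np
from sklearn.metrics import recall_score

# Toy labels/predictions for the three heads, purely for illustration.
true_g, pred_g = [0, 1, 1, 2], [0, 1, 2, 2]   # grapheme root
true_v, pred_v = [0, 0, 1, 1], [0, 1, 1, 1]   # vowel diacritic
true_c, pred_c = [1, 1, 0, 0], [1, 1, 0, 1]   # consonant diacritic

recalls = [recall_score(t, p, average='macro')
           for t, p in [(true_g, pred_g), (true_v, pred_v), (true_c, pred_c)]]
score = np.average(recalls, weights=[2, 1, 1])  # same 2:1:1 weighting as kaggle_score
print(score)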
Example No. 5
 def evaluate(self, y_predict, y_true, target_names=None):
     if self.config.task_type == 'classification':
         return classification_report(y_true=y_true,
                                      y_pred=y_predict,
                                      target_names=target_names)
     elif self.config.task_type == 'ranking':
         return roc_auc_score(y_true=y_true, y_score=y_predict)
     elif self.config.task_type == 'regression':
         return r2_score(y_true=y_true, y_pred=y_predict)
Example No. 6
def eval(test_y, sys_y, pipeline_name, data_dir):
    print(classification_report(test_y, sys_y))
    report = classification_report(test_y, sys_y, output_dict=True)
    df = pd.DataFrame(report).transpose()

    dir = data_dir.split('/')[-2]
    df.to_csv(f'../results/{dir}/{pipeline_name}.csv', index=False)

    return report
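pd.DataFrame(report).transpose() turns the nested report dict into one row per class plus the accuracy/average rows; a compact sketch with toy labels. Note that writing with index=False, as above, drops those row names from the CSV:

import pandas as pd
from sklearn.metrics import classification_report

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]

report = classification_report(y_true, y_pred, output_dict=True)
df = pd.DataFrame(report).transpose()
print(df)                # rows: '0', '1', 'accuracy', 'macro avg', 'weighted avg'
df.to_csv('report.csv')  # keeping the index preserves the class/avg row labels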
Example No. 7
def get_f1_scores(inp_train, out_train):
    inp_test, out_test = init_test_dataframes()
    models = [
        AdaBoostClassifier(n_estimators=300, learning_rate=0.19, algorithm='SAMME.R'),
        LogisticRegression(random_state=0, class_weight='balanced', solver='lbfgs', multi_class='ovr', penalty='l2'),
    ]

    for model in models:
        model.fit(inp_train, out_train)
        print(str(model).split('(')[0])
        print(classification_report(out_test, model.predict(inp_test)))
Example No. 8
 def testforest(self, test, testlabel, forest):
     outputtest = forest.predict(test)
     accuracytrain = accuracy_score(testlabel, outputtest)
     print("The size of the test set is")
     print(np.shape(test))
     print("The accuracy for the test set is %r" % accuracytrain, "and the confusion matrix is")
     #print confusion_matrix(outputtest,testlabel)
     print(classification_report(testlabel, outputtest))
     # generate probability
     outputproba = forest.predict_proba(test)
     outperfor = {'prob0': outputproba[:, 0], 'prob1': outputproba[:, 1], 'output': outputtest, 'target': testlabel}
     outframe = DataFrame(outperfor)
     print(outframe)
     #outframe.to_csv(r'D:\allprob.csv', header=0)
     return accuracytrain, outframe
Example No. 9
 def evaluate_val_for_train(self, sess, data):
     val_q, val_r, val_labels = data
     all_pred_label = []
     low = 0
     batch_size_for_val=300
     while True:
         n_sample = min(low + batch_size_for_val, len(val_labels)) - low
         batch_q_len = self.get_sequences_length(val_q[low:low + n_sample], maxlen=self.max_sentence_len)
         batch_q = pad_sequences(val_q[low:low + n_sample], padding='post')
         batch_r_len = self.get_sequences_length(val_r[low:low + n_sample], maxlen=self.max_sentence_len)
         batch_r = pad_sequences(val_r[low:low + n_sample], padding='post')
         feed_dict = {
             self.input_q: np.array(batch_q),
             self.q_sequence_len: np.array(batch_q_len),
             self.input_r: np.array(batch_r),
             self.r_sequence_len: np.array(batch_r_len),
             self.input_y: np.array(val_labels[low:low + n_sample]),
             self.keep_prob:1.0
         }
         pred_label,loss = sess.run([self.class_label_pred,self.total_loss], feed_dict=feed_dict)
         all_pred_label.append(pred_label)
         low = low + batch_size_for_val
         if low >= len(val_labels):
             break
     all_pred_label = np.concatenate(all_pred_label, axis=0)
     return loss,classification_report(val_labels,all_pred_label)
Example No. 10
 def print_cummulative_clf_report(self):
     """
     Prints the cumulated classification report for all added runs
     :return:
     """
     log.info('Classification report:\n\n' + classification_report(
         self.y_true, self.y_pred, target_names=['Benign', 'Malicious']))
Example No. 11
 def test(self, input_xy, target_names=None, **kwargs):
     xs, trues = zip(*input_xy)
     xs = np.stack(xs).reshape(-1, reduce(operator.mul, xs[0].shape))
     trues = np.stack(trues)
     results = self.predict(xs)
     print(classification_report(trues, results, target_names=target_names))
     return f1_score(trues, results, average="micro")
Example No. 12
def test_model(classifier, X, y):
    y_pred = classifier.predict(X)
    conf_matrix = confusion_matrix(y, y_pred)
    accuracy = accuracy_score(y, y_pred)
    report = classification_report(y, y_pred)
    print(conf_matrix)
    print(report)
    print(accuracy)
Example No. 13
    def _mod_report(self, mod):
        report = classification_report(self.y_test, mod.predict(self.x_test))
        train_auc = roc_auc_score(self.y_train,
                                  mod.predict_proba(self.x_train)[:, 1])
        test_auc = roc_auc_score(self.y_test,
                                 mod.predict_proba(self.x_test)[:, 1])

        return report, train_auc, test_auc
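roc_auc_score needs a continuous score for the positive class, hence the predict_proba(...)[:, 1] slice rather than hard predictions; a small self-contained version of the same pattern (toy data, any probabilistic sklearn classifier):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

X, y = [[0.0], [0.4], [0.6], [1.0]], [0, 0, 1, 1]
clf = LogisticRegression().fit(X, y)

print(classification_report(y, clf.predict(X)))
print(roc_auc_score(y, clf.predict_proba(X)[:, 1]))  # column 1 = P(class 1)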
Example No. 14
 def predict(self, X, trees, y_test):

     reports = []

     for tree in trees:
         y_predict = tree.predict(X)
         # classification_report expects (y_true, y_pred)
         reports.append(classification_report(y_test, y_predict))

     return reports
Example No. 15
 def run(self):
     train_X, train_y, dev_X, dev_y = self.extract_train_dev_data()
     if not self.dev_stage:
         test_X, test_y = self.extract_test_data()
     for c in self.classifiers:
         logger.info("   Classifier: {}".format(c.name))
         c.train(train_X, train_y)
         if self.dev_stage:
             logger.info("   Running classifier on dev data")
             pred_y = c.predict(dev_X)
             print('dev labels')
             print(set(train_y))
             logger.info(classification_report(dev_y, pred_y))
         else:
             logger.info("   Running classifier on test data")
             pred_y = c.predict(test_X)
             print('test labels')
             print(set(train_y))
             logger.info(classification_report(test_y, pred_y))
Example No. 16
 def test(self, input_xy, target_names=None, **kwargs):
     input_x = []
     trues = []
     for x, y in input_xy:
         input_x.append(x)
         trues.append(y.argmax())
     results = self.predict(input_x, prob=False)
     trues = np.array(trues)
     print(classification_report(trues, results, target_names=target_names))
     return f1_score(trues, results, average="micro")
Example No. 17
 def compute(self, k=10):
     self.datasets = (X, y) = self.transformer(self.origin_datasets)
     kf = KFold(n_splits=k, shuffle=True)
     self.clfs = []
     for train_index, test_index in kf.split(X):
         X_train, X_test = X[train_index], X[test_index]
         y_train, y_test = y[train_index], y[test_index]
         clf = self.clf_type()
         clf.fit(X_train, y_train)
         self.clfs.append(clf)  # keep each fold's fitted classifier
         y_pred = clf.predict(X_test)
         print(classification_report(y_test, y_pred))
Example No. 18
    def test_classification(self, test, testlabel, bestmodel):
#        bestmodel=bestmodel
        outputtest = bestmodel.predict(test)
        accuracytest = accuracy_score(testlabel, outputtest)
        print ("The accuracy for the test set is %r" %accuracytest, "and the confusion matrix is")
        print (confusion_matrix(outputtest,testlabel))
        print( classification_report(testlabel, outputtest))
#        probaout=bestmodel.predict_prob(test)
#       probaout= DataFrame(probaout)
#        print probaout
        return outputtest
Example No. 19
    def calc_metrics(self, pred_labels, true_labels):

        from sklearn.metrics import classification_report
        print(classification_report(true_labels, pred_labels))

        acc = accuracy_score(true_labels, pred_labels)
        precision = precision_score(true_labels, pred_labels, average='macro')
        recall = recall_score(true_labels, pred_labels, average='macro')
        f1 = f1_score(true_labels, pred_labels, average='macro')

        return {'acc': acc, 'p': precision, 'r': recall, 'f1': f1}
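The four scalars returned here are the same numbers that appear in the printed report's accuracy line and 'macro avg' row; a quick self-contained check (toy labels):

from sklearn.metrics import classification_report, f1_score

y_true = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]

report = classification_report(y_true, y_pred, output_dict=True)
macro_f1 = f1_score(y_true, y_pred, average='macro')
assert abs(report['macro avg']['f1-score'] - macro_f1) < 1e-12
print(macro_f1)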
Example No. 20
def random_forrest(dir_models, ticket, x, x_test, y, y_test):
    print('getting model...RandomForest')
    clf = RandomForestClassifier(verbose=False, warm_start=True)
    print('training...')
    clf.fit(x, y)
    print('predicting...')
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))
    id = len(os.listdir(dir_models))
    joblib.dump(clf,
                dir_models + ticket + '_random_forrest_' + str(id) + '.pkl')
    return clf.score(x_test, y_test)
Example No. 21
 def evaluate(self, inputs: list, labels: list):
     predictions = []
     labels_max = []
     for j in range(0, len(inputs)):
         input_example = inputs[j]
         data_label = labels[j]
         nn_output = self._nn.predict(input_example)
         predicted_index = np.argmax(nn_output)
         predictions.append(predicted_index)
         label_index = np.argmax(data_label)
         labels_max.append(label_index)
     return classification_report(labels_max, predictions, output_dict=False)
Example No. 22
 def test_classification(self, test, testlabel, bestmodel):
     #        bestmodel=bestmodel
     outputtest = bestmodel.predict(test)
     accuracytest = accuracy_score(testlabel, outputtest)
     print("The accuracy for the test set is %r" % accuracytest,
           "and the confusion matrix is")
     print(confusion_matrix(testlabel, outputtest))
     print(classification_report(testlabel, outputtest))
     #        probaout=bestmodel.predict_prob(test)
     #       probaout= DataFrame(probaout)
     #        print probaout
     return outputtest
Example No. 23
def compare():
    with open('./output.txt', 'r', encoding='utf-8') as f:
        pred_label = []
        for line in f:
            arr = line.strip('\n').split('\t')
            pred_label.append(int(arr[1]))
    with open('./data/test.csv', 'r', encoding='utf-8') as f:
        label = []
        for line in f:
            lineno, sen1, sen2, tmp = line.strip().split('\t')
            label.append(int(tmp))
    print(classification_report(label, pred_label))
Example No. 24
    def evaluate(self,sess,vocab,token_indices, character_indices_padded, token_lengths, pattern ,label_indices,datatype='train'):
        all_predictions = []
        all_y_true = []
        # output_filepath = os.path.join(stats_graph_folder, '{1:03d}_{0}.txt'.format(dataset_type,epoch_number))
        # output_file = codecs.open(output_filepath, 'w', 'UTF-8')
        # original_conll_file = codecs.open(dataset_filepaths[dataset_type], 'r', 'UTF-8')

        for i in range(len(token_indices)):
            feed_dict = {
                self.input_token_indices: token_indices[i],
                self.input_token_character_indices: character_indices_padded[i],
                self.input_token_lengths: token_lengths[i],
                self.input_token_patterns: pattern[i],
                self.dropout_keep_prob: 1.,
            }
            unary_scores, transition_params_trained = sess.run([self.unary_scores, self.transition_parameters],
                                                               feed_dict)

            predictions, _ = tf.contrib.crf.viterbi_decode(unary_scores, transition_params_trained)
            predictions = predictions[1:-1]

            assert (len(predictions) == len(token_indices[i]))

            all_predictions.extend(predictions)
            all_y_true.extend(label_indices[i])

        label_predict = [vocab.labels[i] for i in all_predictions]
        label_true = [vocab.labels[i] for i in all_y_true]

        label_predict = utils_nlp.bioes_to_bio(label_predict)
        label_true = utils_nlp.bioes_to_bio(label_true)

        new_pre = []
        new_true = []
        for i in range(len(label_predict)):
            if label_true[i]!='O' or label_predict[i]!='O':
                new_pre.append(utils_nlp.remove_bio_from_label_name(label_predict[i]))
                new_true.append(label_true[i] if label_true[i]=='O' else label_true[i][2:])
        labels = [label if label=='O' else label[2:] for label in vocab.labels]
        labels = list(set(labels))
        report = classification_report(new_true,new_pre)

        print('matrix')
        matrix = confusion_matrix(new_true, new_pre, labels=labels)
        file = codecs.open(datatype + '_evaluate.txt', 'w', 'utf-8')
        file.writelines(' '.join(labels)+'\n\r')
        for i,row in enumerate(matrix):
            file.writelines(' '.join([str(i) for i in row])+'\n\r')
        file.close()

        print(matrix)
        print(report)
        return report
Example No. 25
def decision_tree(dir_models, ticket, x, x_test, y, y_test):
    print('getting model...decision tree')
    clf = DecisionTreeClassifier()
    print('training...')
    clf.fit(x, y)
    print('predicting...')
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))
    id = len(os.listdir(dir_models))
    joblib.dump(clf,
                dir_models + ticket + '_decission_tree_' + str(id) + '.pkl')

    return clf.score(x_test, y_test)
Example No. 26
def metrics(y_true, y_pred):

    y_final = []
    for i in range(y_pred.shape[0]):
        if y_pred[i][0] > y_pred[i][1]:
            y_final.append(int(0))
        else:
            y_final.append(int(1))
    print(y_final)
    classify_report = classification_report(y_true, y_final)

    print('classify_report : \n', classify_report)

    return 0
Example No. 27
def voting_random_forrest(dir_models, ticket, x, x_test, y, y_test):
    print('getting model...voting RandomForest')
    estimators = [(str(idd), RandomForestClassifier()) for idd in range(100)]
    clf = VotingClassifier(estimators=estimators)
    print('training...')
    clf.fit(x, y)
    print('predicting...')
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))
    id = len(os.listdir(dir_models))
    joblib.dump(
        clf, dir_models + ticket + '_voting_rand_forest_' + str(id) + '.pkl')

    return clf.score(x_test, y_test)
Example No. 28
def voting_decision_tree(dir_models, ticket, x, x_test, y, y_test):
    print('getting model...Voting decision tree')
    estimators = [(str(idd), DecisionTreeClassifier()) for idd in range(100)]
    clf = VotingClassifier(estimators=estimators, voting='soft')
    print('training...')
    clf.fit(x, y)
    print('predicting...')
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))
    id = len(os.listdir(dir_models))
    joblib.dump(clf,
                dir_models + ticket + '_voting_dectree_' + str(id) + '.pkl')

    return accuracy_score(y_test, predicted)
Example No. 29
def bernoulli_classifier(dir_models, ticket, x, x_test, y, y_test):
    print('getting model...BernoulliNB')
    clf = BernoulliNB(binarize=True)

    print('training...')
    clf.fit(x, y)

    print('predicting...')
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))

    id = len(os.listdir(dir_models))
    joblib.dump(clf, dir_models + ticket + '_bernoulli_' + str(id) + '.pkl')
    return clf.score(x_test, y_test)
Example No. 30
def gradient_classifier(dir_models, ticket, x, x_test, y, y_test):
    print('getting model...GBC')
    clf = GradientBoostingClassifier(n_estimators=1000)

    print('training...')
    clf.fit(x, y)

    print('predicting...')
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))

    id = len(os.listdir(dir_models))
    joblib.dump(clf, dir_models + ticket + '_gbc_' + str(id) + '.pkl')

    return clf.score(x_test, y_test)
Example No. 31
def adaboost_classifier(dir_models, ticket, x, x_test, y, y_test):
    print('getting model...Ada')
    clf = AdaBoostClassifier()

    print('training...')
    clf.fit(x, y)

    print('predicting...')
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))

    id = len(os.listdir(dir_models))
    joblib.dump(clf, dir_models + ticket + '_adaboost_' + str(id) + '.pkl')

    return clf.score(x_test, y_test)
Example No. 32
def mlp_classifier(dir_models, ticket, x, x_test, y, y_test):
    print('getting model...MLP')
    clf = MLPClassifier(early_stopping=True)

    print('training...')
    clf.fit(x, y)

    print('predicting...')
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))

    id = len(os.listdir(dir_models))
    joblib.dump(clf, dir_models + ticket + '_mlp_' + str(id) + '.pkl')

    return clf.score(x_test, y_test)
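The classifier functions above all repeat a nearly identical fit / print-report / joblib.dump / score sequence with a different estimator; a hedged sketch of how that shared pattern could be factored into one helper (hypothetical function name, same persistence convention):

import os
import joblib
from sklearn.metrics import classification_report

def fit_report_and_dump(clf, name, dir_models, ticket, x, x_test, y, y_test):
    # Hypothetical helper mirroring the functions above: fit, report, persist, score.
    clf.fit(x, y)
    print(classification_report(y_test, clf.predict(x_test)))
    model_id = len(os.listdir(dir_models))
    joblib.dump(clf, os.path.join(dir_models, '%s_%s_%d.pkl' % (ticket, name, model_id)))
    return clf.score(x_test, y_test)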
Example No. 33
    def cv_test_module(self, module, module_kwargs, fit_kwargs):
        logging.info('Cross-validate testing module %s...' % module.__name__)

        xs, ys = self.dg.get_all()

        module_object = module(**module_kwargs)
        predict = cross_val_predict(
            module_object,
            xs, ys,
            cv=10,
            n_jobs=-1,
        )

        print(classification_report(ys, predict))
        logging.info('Cross-validate testing module %s finished' % module.__name__)
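cross_val_predict yields exactly one out-of-fold prediction per sample, so a single classification_report over the full label vector summarizes the whole 10-fold run; a compact self-contained version (iris data and logistic regression stand in for the module under test):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

X, y = load_iris(return_X_y=True)
pred = cross_val_predict(LogisticRegression(max_iter=1000), X, y, cv=10)
print(classification_report(y, pred))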
Example No. 34
def main(args):
    ## tr_data training set
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "base_dir",
        nargs="?",
        default=os.path.join(os.getcwd(), "mseg_workspace"),
        help="this is the working directory, all sub dirs live under it",
    )
    parser.add_argument(
        "pm_dir",
        nargs="?",
        default="pm_default",
        help="this is the directory in which to store the prosodic model file",
    )
    parser.add_argument(
        "training_file",
        nargs="?",
        default=TRAIN_FILE_DEFAULT,
        help="name of CSV file that contains correctly annotated training examples",
    )
    parser.add_argument(
        "test_file",
        nargs="?",
        default=TEST_FILE_DEFAULT,
        help="name of CSV file that contains mysterious cases that must be tested",
    )
    parser.add_argument(
        "-lr",
        "--logistic_regression",
        default=False,
        action="store_true",
        help="use logistic regression classifier (default is RBF-SVM)",
    )
    args = parser.parse_args()

    base_dir = args.base_dir
    pm_dir = args.pm_dir
    tr_file = args.training_file
    test_fname = args.test_file
    use_lr = args.logistic_regression

    #     if(len(args)==3):
    #         base_dir = args[0]
    #         pm_dir = args[1]
    #         tr_file = args[2]
    #         test_fname = args[3]
    #     else:
    #         base_dir = DIR
    #         pm_dir = "pm_default"
    #         tr_file = TRAIN_FILE_DEFAULT
    #         test_fname = TEST_FILE_DEFAULT
    # #         do_search = False
    # #         use_pilot = False
    n_samples = -1
    cache = 800

    #     pm_dir= raw_input("enter PM name: [%s]" % pm_dir) or pm_dir
    #     tr_file = raw_input("enter PM training file name: [%s]" % tr_file) or tr_file

    tr_data = read_file(os.path.join(base_dir, tr_file), ",", skip_header=True)

    #     test_fname = raw_input("enter file to test on: [%s]" % test_fname) or test_fname
    #     use_lr = bool(raw_input("use logistic regression [False]?")) or False

    if not use_lr:
        n_samples = 6000
        out_fname = test_fname + "-probabilities.dat"
        report_fname = test_fname + "-report.txt"
    else:
        out_fname = test_fname + "-probabilities.dat"
        report_fname = test_fname + "-report-LR.txt"

    out_file = os.path.join(base_dir, pm_dir, out_fname)
    report_fname = os.path.join(base_dir, pm_dir, report_fname)
    # clear extant predictions file
    if os.path.exists(out_file):
        os.remove(out_file)
        print "removed", out_file

    print base_dir + "/" + tr_file + " -SVM-> ", out_file

    test_data = read_file(os.path.join(base_dir, test_fname), ",", skip_header=True)

    # sel = [12,13,14,15,21,22,23,24]
    sel = range(7, 30)
    # sel = [8,21,29, 24,25,27]

    (_, _, tr_samples, tr_classes) = dissect(tr_data, sel)
    (_, te_words, te_samples, te_classes) = dissect(test_data, sel)

    if n_samples > 0:
        tr_samples, _, tr_classes, _ = train_test_split(
            tr_samples, tr_classes, train_size=n_samples, stratify=tr_classes
        )

    p = sum(c == 1.0 for c in tr_classes)  # count the positive instances
    n = len(tr_classes) - p  # derive the negative instances
    print "n=", n, " p=", p
    wgt = float(n) / float(p)  # cast and divide
    print "wgt=", wgt
    #     classWeight = { 1: wgt }

    # tr_samples, te_samples, tr_classes, te_classes = train_test_split(samples, classes, test_size=0.20, random_state=0, stratify=classes)

    scaler = preprocessing.StandardScaler().fit(np.array(tr_samples))
    tr_samples = scaler.transform(tr_samples)

    clf = None
    best_params = None
    # override the defaults with the results of a grid search if desired (takes a while)

    # pickled = False
    pkl_dir = os.path.join(base_dir, pm_dir, "pkl")
    pickled_model = os.path.join(pkl_dir, "svm_classifier.pkl")

    if os.path.exists(pickled_model) and not overwrite_pkl:
        clf = joblib.load(pickled_model)
        clf.set_params(verbose=True)
        print "loaded pickled model...", pickled_model

    else:
        if not os.path.exists(pkl_dir):  # output dir doesn't exist so make it
            os.makedirs(pkl_dir)
            print "made dir for pickled model:", pkl_dir

        cmin, cmax, cstep = -5, 17, 2
        cr = range(cmin, cmax, cstep)
        print (cr)
        # c_range = [ pow(2, y) for y in cr]
        # c_range =(0.005, 0.5, 5, 50, 500, 5000, 50000)
        c_range = (0.5, 50, 5000)
        print ("c_range", c_range)

        gmin, gmax, gstep = -15, 5, 2
        gr = range(gmin, gmax, gstep)
        print (gr)
        # gamma_range = [ pow(2, y) for y in gr ]
        # gamma_range = (0.00005, 0.0005, 0.005, 0.05, 0.5, 5.0, 50, 500)
        gamma_range = (0.0005, 0.05, 5.0, 500)

        print ("gamma_range", gamma_range)

        c_dist = scipy.stats.expon(scale=100)
        gamma_dist = scipy.stats.expon(scale=0.01)

        if use_lr:
            estr = LogisticRegression(class_weight="balanced")
            #             estr = LogisticRegression()
            param_dist = {"C": c_dist}
        else:
            estr = svm.SVC(kernel="rbf", cache_size=800, probability=True, class_weight="balanced")
            # estr = svm.LinearSVC(class_weight='balanced')
            param_dist = {"C": c_dist, "gamma": gamma_dist}

        # searcher = RandomizedSearchCV(estr, param_distributions=param_dist, n_iter=100, n_jobs=-1, cv=5, verbose=True ) #, scoring="recall")
        searcher = RandomizedSearchCV(
            estr, param_distributions=param_dist, n_iter=100, n_jobs=-1, verbose=True, scoring="recall"
        )
        searcher.fit(tr_samples, tr_classes)
        report(searcher.grid_scores_)
        clf = searcher.best_estimator_

        print "COMPARING CLF PARAMS WITH BEST PARAMS (shd be same)"
        print clf.get_params()
        print best_params

        joblib.dump(clf, pickled_model)

    print(clf)

    #     print "FITTING"
    #     clf.set_params(verbose=True)
    #     clf.fit(tr_samples, tr_classes)
    #     print clf

    # NOW TO TEST AGAINST HELD-OUT/TEST DATA
    te_samples = scaler.transform(te_samples)

    print "no test cases", len(te_samples)

    predictions = -1.0 * clf.predict_log_proba(
        te_samples
    )  # this is a list of pairs of probs in form [ [1-p, p],  ... ]
    # predictions = -1.0 * clf.decision_function(te_samples)
    print(predictions)
    predicted_classes = clf.predict(te_samples)

    print (
        "TEST: Number of mislabelled points out of a total %d points : %d"
        % (len(te_samples), (te_classes != predicted_classes).sum())
    )
    print (classification_report(te_classes, predicted_classes))

    rpt = open(report_fname, "w")
    rpt.write(classification_report(te_classes, predicted_classes))
    rpt.write("\n")
    rpt.close()
    print "wrote report file", rpt

    pred_file = open(out_file, "w")
    pred_file.write("labels 0 1\n")  # this emulates an earlier file format for compatibility
    for word, prob_tuple, guessed_class in zip(te_words, predictions, predicted_classes):
        pred_file.write("%d %f %f %s\n" % (guessed_class, prob_tuple[0], prob_tuple[1], word))

    pred_file.close()
    print "wrote predictions file:", pred_file
    model.add(SimpleRNN(X_train.shape[1], input_dim=X_train.shape[1]))
    model.add(Activation('relu'))
    model.add(SimpleRNN(20000))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(SimpleRNN(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss=loss, optimizer=optim, metrics=['accuracy'])
    return model


classifier = KerasClassifier(build_fn=create_model, nb_epoch=nb_epoch, batch_size=batch_size)
history = classifier.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch)
Y_pred = classifier.predict(X_test, batch_size=batch_size)

print(classification_report(y_true=Y_test, y_pred=Y_pred))

plt.figure()
plt.plot(history.history['acc'])
plt.title('Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Training', 'Test'], loc='upper left')
plt.savefig("data/acc.png")

# summarize history for loss
plt.figure()
plt.plot(history.history['loss'])
plt.title('Loss values')
plt.ylabel('Loss')
plt.xlabel('Epoch')
                #                                class_weight={0: 3, 1: 1, 2: 1, 3: 1, 4: 1, 5: 2, 6: 1})

                # model = LogisticRegression(C=subsample, verbose=0, penalty='l1', max_iter=100)
                # model = KNeighborsClassifier(n_neighbors=learning_rate)
                # model = xgb.XGBRegressor(max_depth=depth, n_estimators=n_estimators, learning_rate=learning_rate,
                #                          nthread=1, subsample=subsample, silent=True, colsample_bytree=0.8)
                # model = LinearSVC(C=0.9, penalty='l2', dual=False, verbose=1, max_iter=100000)

                model.fit(trtrfe, trtrtrue)
                # mean accuracy on the given test data and labels
                predicted = [math.floor(x) for x in model.predict(trtefe)]

                score = model.score(trtefe, trtetrue)
                print("score =", score)

                print(classification_report(trtetrue, predicted))
                print(confusion_matrix(trtetrue, predicted))

                if score > best_score or True:
                    best_model = model
                    best_score = score

                    best_model.fit(train_features, train_true)
                    predicted = [math.floor(x) for x in best_model.predict(test_features)]
                    fname = "data/net_result/sol_" + str(score) + "_" + str(time.time()) + ".csv"
                    write_sol(predicted, fname)
                    print("this model", depth, "\t", subsample, "\t", score)
                    print("best model", best_score)

best_model.fit(trtefe, trtetrue)
predicted = [math.floor(x) for x in best_model.predict(test_features)]