accs = sklearn.cross_validation.cross_val_score(
                    ensemble, X, Y, cv = 3)
                acc = np.mean(accs)
                print "CV accuracy %0.4f (std %0.4f)" % \
                    (acc, np.std(accs))
                d['cv_acc'].append(acc)

                aucs = sklearn.cross_validation.cross_val_score(
                    ensemble, X, Y, cv = 5, scoring='roc_auc')
                auc = np.mean(aucs)
                print "CV AUC %0.4f (std %0.4f)" % \
                    (auc, np.std(aucs))
                d['cv_auc'].append(auc)

                ensemble.fit(X, Y)

                X_pos_test = vectorizer.transform(cancer_peptides)
                Y_pos_pred = ensemble.predict(X_pos_test)
                pos_acc = np.mean(Y_pos_pred)
                print "Tumor antigen accuracy %0.4f" % (pos_acc,)
                d['pos_acc'].append(pos_acc)

                X_neg_test = vectorizer.transform(
                    non_immunogenic_hiv_peptides)
                Y_neg_pred = ensemble.predict(X_neg_test)
                neg_acc = 1.0 - np.mean(Y_neg_pred)
                print "Non-immunogenic accuracy %0.4f" % (neg_acc,)
                d['neg_acc'].append(neg_acc)

                n_pos_pred = np.sum(Y_pos_pred)
                    X -= ord('0')
                    return X

                # Build the training arrays: X comes from running the combined
                # samples through strings_to_array; Y and W are plain NumPy
                # copies of the combined label and weight lists.
                X = strings_to_array(X_combined)
                Y = np.array(Y_combined)
                W = np.array(W_combined)
                # Report the sizes of `imm` and `non`, the shape of X and the
                # sum of Y (printed as "n_true").
                print "# imm = %d, # non = %d" % (len(imm), len(non))
                print "Data shape", X.shape, "n_true", np.sum(Y)

                rf = BalancedEnsembleClassifier(n_estimators = 200)
                # Disabled alternatives kept for reference: a 10-fold ROC AUC
                # cross-validation of `rf`, and a plain RandomForestClassifier
                # in place of the balanced ensemble.
                #aucs = sklearn.cross_validation.cross_val_score(
                #    rf, X, Y, cv = 10, scoring='roc_auc')
                #print "CV AUC %0.4f (std %0.4f)" % (np.mean(aucs), np.std(aucs))
                #d['cv_auc'].append(np.mean(aucs))
                #rf = RandomForestClassifier(n_estimators = 100)
                # W is passed as the third positional argument to fit --
                # presumably per-sample weights; confirm against
                # BalancedEnsembleClassifier.fit.
                rf.fit(X, Y, W)
                def predict(peptides):
                    Y_pred = np.zeros(len(peptides), dtype=float)
                    counts = np.zeros(len(peptides), dtype=int)
                    X_test, _, Indices = expand(peptides)
                    X_test = strings_to_array(X_test)
                    #Y_pred_raw = rf.predict(X_test)

                    Y_pred_prob = rf.predict_proba(X_test)[:, 1]
                    Y_pred_rescaled = (2 * (Y_pred_prob - 0.5))
                    Y_pred_weight = np.sign(Y_pred_rescaled) * Y_pred_rescaled ** 2
                    # group outputs by the sample they came from, 
                    # at the end we'll have the majority vote 
                    #Y_pred = rf.predict(X_test)
                    for (y,i) in zip(Y_pred_weight, Indices):
                        Y_pred[i] += y