Example #1
def plot_conf_matrix(y_true, y_pred, normed=True, heatmap_color ='Blues', **kwargs):

    ## check to make sure that y_pred is an array of integers if y_true is a bunch of integers
    true_int_check = all(isinstance(a,int) for a in y_true)
    pred_int_check = all(isinstance(a,int) for a in y_pred)
    if true_int_check and not pred_int_check: # convert the y_pred values to integers
        if isinstance(y_pred, pd.Series):
            y_pred = y_pred.astype(int)

    my_c = metrics.confusion_matrix(y_true, y_pred)

    print(metrics.matthews_corrcoef(y_true, y_pred))
    if normed:
        cm_normalized = my_c.astype('float') / my_c.sum(axis=1)[:, np.newaxis]
        my_c = cm_normalized
        plt.title('Normalized RF Classifier Confusion Matrix')
    else:
        plt.title('Random Forest Classifier Confusion Matrix')

    sns.heatmap(my_c, annot=True,  fmt='',cmap=heatmap_color, **kwargs)
    plt.ylabel('True')
    plt.xlabel('Assigned')
    plt.show()

    return
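A quick call sketch for plot_conf_matrix with toy labels; it assumes pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns and sklearn.metrics as metrics are imported as in the snippet above.

y_true = [0, 1, 1, 0, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1, 1, 1]
plot_conf_matrix(y_true, y_pred, normed=True)  # prints the MCC, then shows the normalized heatmap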
Example #2
def melodiness_metrics(m_train, m_test, y_train, y_test):
    """ Compute metrics on melodiness score

    Parameters
    ----------
    m_train : np.array [n_samples]
        melodiness scores for training set
    m_test : np.array [n_samples]
        melodiness scores for testing set
    y_train : np.array [n_samples]
        Training labels.
    y_test : np.array [n_samples]
        Testing labels.

    Returns
    -------
    melodiness_scores : dict
        melodiness scores for training set
    """
    m_bin_train = 1*(m_train >= 1)
    m_bin_test = 1*(m_test >= 1)

    train_scores = {}
    test_scores = {}

    train_scores['accuracy'] = metrics.accuracy_score(y_train, m_bin_train)
    test_scores['accuracy'] = metrics.accuracy_score(y_test, m_bin_test)

    train_scores['mcc'] = metrics.matthews_corrcoef(y_train, m_bin_train)
    test_scores['mcc'] = metrics.matthews_corrcoef(y_test, m_bin_test)

    (p, r, f, s) = metrics.precision_recall_fscore_support(y_train,
                                                           m_bin_train)
    train_scores['precision'] = p
    train_scores['recall'] = r
    train_scores['f1'] = f
    train_scores['support'] = s

    (p, r, f, s) = metrics.precision_recall_fscore_support(y_test,
                                                           m_bin_test)
    test_scores['precision'] = p
    test_scores['recall'] = r
    test_scores['f1'] = f
    test_scores['support'] = s

    train_scores['confusion matrix'] = \
        metrics.confusion_matrix(y_train, m_bin_train, labels=[0, 1])
    test_scores['confusion matrix'] = \
        metrics.confusion_matrix(y_test, m_bin_test, labels=[0, 1])

    train_scores['auc score'] = \
        metrics.roc_auc_score(y_train, m_train + 1, average='weighted')
    test_scores['auc score'] = \
        metrics.roc_auc_score(y_test, m_test + 1, average='weighted')

    melodiness_scores = {'train': train_scores, 'test': test_scores}

    return melodiness_scores
Example #3
def clf_metrics(p_train, p_test, y_train, y_test):
    """ Compute metrics on classifier predictions

    Parameters
    ----------
    p_train : np.array [n_samples]
        predicted probabilities for training set
    p_test : np.array [n_samples]
        predicted probabilities for testing set
    y_train : np.array [n_samples]
        Training labels.
    y_test : np.array [n_samples]
        Testing labels.

    Returns
    -------
    clf_scores : dict
        classifier scores for training set
    """
    y_pred_train = 1*(p_train >= 0.5)
    y_pred_test = 1*(p_test >= 0.5)

    train_scores = {}
    test_scores = {}

    train_scores['accuracy'] = metrics.accuracy_score(y_train, y_pred_train)
    test_scores['accuracy'] = metrics.accuracy_score(y_test, y_pred_test)

    train_scores['mcc'] = metrics.matthews_corrcoef(y_train, y_pred_train)
    test_scores['mcc'] = metrics.matthews_corrcoef(y_test, y_pred_test)

    (p, r, f, s) = metrics.precision_recall_fscore_support(y_train,
                                                           y_pred_train)
    train_scores['precision'] = p
    train_scores['recall'] = r
    train_scores['f1'] = f
    train_scores['support'] = s

    (p, r, f, s) = metrics.precision_recall_fscore_support(y_test,
                                                           y_pred_test)
    test_scores['precision'] = p
    test_scores['recall'] = r
    test_scores['f1'] = f
    test_scores['support'] = s

    train_scores['confusion matrix'] = \
        metrics.confusion_matrix(y_train, y_pred_train, labels=[0, 1])
    test_scores['confusion matrix'] = \
        metrics.confusion_matrix(y_test, y_pred_test, labels=[0, 1])

    train_scores['auc score'] = \
        metrics.roc_auc_score(y_train, p_train + 1, average='weighted')
    test_scores['auc score'] = \
        metrics.roc_auc_score(y_test, p_test + 1, average='weighted')

    clf_scores = {'train': train_scores, 'test': test_scores}

    return clf_scores
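A small usage sketch for clf_metrics (and, analogously, melodiness_metrics above) with synthetic probabilities; it assumes numpy and sklearn.metrics are imported as in the snippet.

import numpy as np

rng = np.random.RandomState(0)
y_train = rng.randint(0, 2, size=200)
y_test = rng.randint(0, 2, size=100)
# Synthetic "predicted probabilities": above 0.5 for positives, below 0.5 for negatives.
p_train = 0.25 + 0.5 * y_train + 0.2 * rng.rand(200)
p_test = 0.25 + 0.5 * y_test + 0.2 * rng.rand(100)

scores = clf_metrics(p_train, p_test, y_train, y_test)
print(scores['test']['mcc'], scores['test']['accuracy'])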
Example #4
	def printAnalysis(self,true_pred,y_pred1):

		print "########## Analysing the Model result ##########################"

		math_corr = matthews_corrcoef( true_pred,y_pred1)
		roc_auc = roc_auc_score( true_pred,y_pred1)

		print(classification_report( true_pred,y_pred1))
		print("Matthews correlation :" + str(matthews_corrcoef( true_pred,y_pred1)))
		print("ROC AUC score :" + str(roc_auc_score( true_pred,y_pred1)))
Example #5
def _show_classification_results(y_test, y_pred):
    """ Prints performance metrics for a classifier """

    print(metrics.classification_report(y_test, y_pred))
    print()
    print('Confusion matrix:')
    print(metrics.confusion_matrix(y_test, y_pred))
    print()
    print('Matthew\'s correlation coefficient:',
          metrics.matthews_corrcoef(y_test, y_pred))
    print('F1 score:',
          metrics.f1_score(y_test, y_pred))
    print()
def score_MCC(ground_truth, scores):
    '''
    assuming the model output is the probability of being default,
    then this probability can be used for ranking. Then using the fraction of
    default in validation data to assign the proper threshold to the prediction
    '''

    if isinstance(scores, pd.Series):
        scores = scores.values

    if isinstance(ground_truth, pd.Series):
        ground_truth = ground_truth.values

    tmp_ground_truth = np.copy(ground_truth)
    fault_frac = tmp_ground_truth.mean()
    #print 'score shape:', scores.shape, 
    print('mean of ground truth:', fault_frac)
    thres_value = np.percentile(scores, 100.*(1-fault_frac), axis=0)
    print('threshold for preds:', thres_value)
    binary_scores = scores > thres_value
    binary_scores = binary_scores.astype(int)
    ## convert to sk-learn format
    np.place(binary_scores, binary_scores==0, -1)
    np.place(tmp_ground_truth, tmp_ground_truth==0, -1)

    return matthews_corrcoef(tmp_ground_truth, binary_scores)
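A minimal usage sketch for score_MCC with toy data, assuming numpy as np, pandas as pd and matthews_corrcoef are imported as in the surrounding snippet.

import pandas as pd

# 2 of the 10 samples are positive and receive the two highest scores, so the
# fraction-based threshold recovers them exactly and the returned MCC is 1.0.
ground_truth = pd.Series([0, 0, 0, 0, 1, 0, 0, 0, 0, 1])
scores = pd.Series([0.10, 0.20, 0.15, 0.30, 0.90, 0.25, 0.05, 0.40, 0.35, 0.80])
print(score_MCC(ground_truth, scores))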
Example #7
    def KFold_method(self):
        
        kf = KFold(n_splits=10)
        for train_index, test_index in kf.split(self.FeatureSet):
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for trainid in train_index.tolist():
                X_train.append(self.FeatureSet[trainid])
                y_train.append(self.Label[trainid])

            for testid in test_index.tolist():
                X_test.append(self.FeatureSet[testid])
                y_test.append(self.Label[testid])
            #clf = tree.DecisionTreeClassifier()        
            #clf = clf.fit(X_train, y_train)
            #pre_labels = clf.predict(X_test)
            clf = AdaBoostClassifier(n_estimators=100)
            clf = clf.fit(X_train, y_train)
            pre_labels = clf.predict(X_test)
            # Model evaluation
            ACC = metrics.accuracy_score(y_test, pre_labels)
            MCC = metrics.matthews_corrcoef(y_test, pre_labels)
            SN = self.performance(y_test, pre_labels)
            print(ACC, SN)
Example #8
    def Bootstrap_method(self):
        rs = cross_validation.ShuffleSplit(
            len(self.FeatureSet), 10, 0.25, random_state=0)
        clf = tree.DecisionTreeClassifier()
        for train_index, test_index in rs:
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for trainid in train_index.tolist():
                X_train.append(self.FeatureSet[trainid])
                y_train.append(self.Label[trainid])

            for testid in test_index.tolist():
                X_test.append(self.FeatureSet[testid])
                y_test.append(self.Label[testid])

            #clf = clf.fit(X_train, y_train)
           # pre_labels = clf.predict(X_test)
            clf = AdaBoostClassifier(n_estimators=100)
            clf = clf.fit(X_train, y_train)
            pre_labels = clf.predict(X_test)
            # Model evaluation
            ACC = metrics.accuracy_score(y_test, pre_labels)
            MCC = metrics.matthews_corrcoef(y_test, pre_labels)
            SN = self.performance(y_test, pre_labels)
            print(ACC, SN)
Example #9
def compute_MCC(y_true, y_score, threshold_num=500):
    """Compute the Matthews Correlation Coefficient.
    
    :param y_true: true binary labels in range {0, 1}
    :type y_true: numpy array
    :param y_score: the probability estimates of the positive class
    :type y_score: numpy array
    :param threshold_num: the number of thresholds
    :type threshold_num: int
    :return: the maximum Matthews Correlation Coefficient
    :rtype: float
    """

    # Get the ranks
    ranks = get_ranks(y_score)

    # Generate the array which contains the value of thresholds
    threshold_array = np.linspace(np.min(ranks) - 1, np.max(ranks) + 1, num=threshold_num)

    # Generate MCC values
    MCC_list = []
    for threshold in threshold_array:
        MCC_list.append(matthews_corrcoef(y_true, ranks > threshold))
    MCC_array = np.array(MCC_list)

    # Illustrate threshold and MCC values
    # pylab.figure()
    # pylab.plot(threshold_array / np.max(ranks), MCC_array)
    # pylab.show()

    return np.max(MCC_array)
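compute_MCC relies on a get_ranks helper that is not shown in this snippet; the stand-in below (an assumption, not the original definition) simply rank-transforms the scores with scipy.stats.rankdata, followed by a small call.

import numpy as np
from scipy.stats import rankdata

def get_ranks(y_score):
    # Assumed stand-in for the helper used above: convert scores to their ranks.
    return rankdata(y_score)

y_true = np.array([0, 0, 1, 1, 0, 1])
y_score = np.array([0.10, 0.40, 0.35, 0.80, 0.20, 0.70])
print(compute_MCC(y_true, y_score, threshold_num=50))  # maximum MCC over the swept thresholds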
Example #10
    def evalmetric(pred, truth):
        return 'auc_mine', metrics.roc_auc_score(truth.get_label(), pred)

        thresholds =  np.arange(99.6, 99.9, 0.025)
        bestScore =  0
        bestT = 0
        bestAcc = 0
        bestCf = np.zeros((2,2))

        thresholds = [0.10]
        for t in thresholds:
            temp = np.copy(pred)
            temp[np.where(pred > np.percentile(pred, t))] = 1
            temp[np.where(pred <= np.percentile(pred, t))] = 0
            score = metrics.matthews_corrcoef(truth.get_label(), temp)
            

            if score > bestScore:
                bestScore = score
                bestT = np.percentile(pred, t)
                bestAuc = metrics.roc_auc_score(truth.get_label(), temp, reorder=True)
                bestCf = metrics.confusion_matrix(truth.get_label(), temp)

        
        print('threshold {} mcc {} auc {} TN {} FP {} FN {} TP {}\n'.format(bestT, bestScore, bestAcc, bestCf[0][0], bestCf[0][1], bestCf[1][0], bestCf[1][1]))

        return 'mcc', -1 * bestScore
Example #11
def calcualte_threshold(positives, negatives, measure="SPC", measure_threshold=0.95, thresholds=None, attempt=0):
    """Plot the TPR the FPR vs threshold values
 
    Input:
    postives - list of scores of postive runs
    negatives - list of scores of negative runs
    measure - choose coffectiong by 95% Specificity ("SPC"), or matthews_corrcoef ("MCC")
    """
    assert measure in ["TPR", "FPR", "SPC", "MCC", "PPV", "FDR", "ACC"]
    y_true = [1]*len(positives)+[0]*len(negatives)
    values = {name:[] for name in ["TPR", "FPR", "SPC", "MCC", "PPV", "FDR", "ACC"]}
    saveThreshold = None
    saveValue = 1.0
    thresholds = list(sorted(thresholds or map(lambda i: i/10., range(1, 10000))))

    for threshold in thresholds:
        TN = sum([1 for score in negatives if score < threshold])
        FP = sum([1 for score in negatives if score >= threshold])
        TP = sum([1 for score in positives if score >= threshold])
        FN = sum([1 for score in positives if score < threshold])

        values["FPR"].append(float(FP)/(FP+TN))
        values["TPR"].append(float(TP)/(TP+FN))
        values["SPC"].append(float(TN)/(FP+TN))

        y_pred = [int(score >= threshold) for scores in (positives, negatives) for score in scores]
        values["MCC"].append(matthews_corrcoef(y_true, y_pred))
        values["PPV"].append(float(TP)/(TP+FP) if TP+FP>0 else 0.0)
        values["FDR"].append(float(FP)/(TP+FP) if TP+FP>0 else 0.0)
        values["ACC"].append(float(TP+TN)/(len(positives)+len(negatives)))
        
    specificity_curve_inverse = interp1d(values[measure], thresholds)
    saveThreshold = specificity_curve_inverse(measure_threshold) #modified by Alexey 0.95 => measure_threshold
                
    return saveThreshold, thresholds, values
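A toy call of calcualte_threshold, assuming matthews_corrcoef and scipy's interp1d are imported as in the original file; the scores below are made up and an explicit threshold grid is passed in.

import numpy as np

positives = [0.95, 0.90, 0.85, 0.80, 0.75]   # positive runs score high
negatives = [0.10, 0.20, 0.30, 0.40, 0.50]   # negative runs score low
cutoff, thresholds, values = calcualte_threshold(
    positives, negatives, measure="SPC", measure_threshold=0.95,
    thresholds=list(np.linspace(0.0, 1.0, 101)))
print(cutoff)  # interpolated threshold at which specificity reaches about 0.95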
Example #12
def eval_mcc(y_true, y_prob, show=False):
    idx = np.argsort(y_prob)
    y_true = np.array(y_true, dtype=int)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positive
    numn = n - nump  # number of negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    mccs = np.zeros(n)
    for i in range(n):
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
        new_mcc = mcc(tp, tn, fp, fn)
        mccs[i] = new_mcc
        if new_mcc >= best_mcc:
            best_mcc = new_mcc
            best_id = i
    best_proba = y_prob[idx[best_id]]
    y_pred = (y_prob > best_proba).astype(int)
    final_mcc = matthews_corrcoef(y_true, y_pred)
    if show:
        return best_proba, final_mcc, y_pred
    else:
        return final_mcc
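eval_mcc calls an mcc(tp, tn, fp, fn) helper that is not included in the snippet; a plausible definition (assumed here, not taken from the original) is the closed-form Matthews correlation coefficient computed from confusion-matrix counts, with a guard for a zero denominator.

import numpy as np

def mcc(tp, tn, fp, fn):
    # MCC = (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)), 0 when undefined.
    num = tp * tn - fp * fn
    den = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    return 0.0 if den == 0 else num / np.sqrt(den)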
    def fit_knn(config):
        ### Prepare result holders ###
        values = {}
        results = {}
        monitors = {}
        E = {"config": config, "results": results, "monitors":monitors, "values":values}

        ### Print experiment header ###
        print_exp_name(config)

        ### Train ###
        monitors["acc_fold"] = []
        monitors["mcc_fold"] = []
        monitors["wac_fold"] = []
        monitors["cm"] = [] # confusion matrix

        results["mean_acc"] = 0
        results["mean_mcc"] = 0


        values["mean_cls"] = Y.mean()
        values["transformers"] = []

        for fold in D["folds"]:
            if config["use_embedding"] == 0:
                tr_id, ts_id = fold["train_id"], fold["test_id"]
                X_train, Y_train, X_test, Y_test = X[tr_id].todense(), Y[tr_id], X[ts_id].todense(), Y[ts_id]
            else:
                X_train, Y_train, X_test, Y_test = fold["X_train"], fold["Y_train"], fold["X_test"], fold["Y_test"]

            if config["use_embedding"] == 0:
                clf = KNeighborsClassifier(n_neighbors=config["KNN_K"], metric="jaccard")
                clf.fit(X_train, Y_train)
                Y_pred = clf.predict(X_test)
            else: # Looking at the similarity of the closest example and getting K=1 from arbitrary K :)
                Y_pred = []
                for x in X_test:
                    Y_pred.append(1 if x[-4] > x[-2] else -1)
                Y_pred = np.array(Y_pred)

            acc_fold, mcc_fold = accuracy_score(Y_test, Y_pred), matthews_corrcoef(Y_test, Y_pred)
            cm = confusion_matrix(Y_test, Y_pred)
            tp, fn, fp, tn = cm[1,1], cm[1,0], cm[0,1], cm[0,0]

            monitors["cm"].append(cm)
            monitors["wac_fold"].append(0.5*tp/float(tp+fn) + 0.5*tn/float(tn+fp))
            monitors["acc_fold"].append(acc_fold)
            monitors["mcc_fold"].append(mcc_fold)

        monitors["acc_fold"] = np.array(monitors["acc_fold"])
        monitors["mcc_fold"] = np.array(monitors["mcc_fold"])
        monitors["wac_fold"] = np.array(monitors["wac_fold"])

        results["mean_acc"] = monitors["acc_fold"].mean()
        results["mean_mcc"] = monitors["mcc_fold"].mean()
        results["mean_wac"] = monitors["wac_fold"].mean()

        logger.info(results)

        return E
Example #14
    def run(self):
        with LpcApocResultTask(self.tname,
                               self.qname,
                               self.subset).output().open('r') as f:
            apoc_parser = ApocResultParer(f.read())

        f = self.output().open('w')

        data = {}
        data['tname'] = self.tname
        data['qname'] = self.qname
        data['t pocket'] = LpcPocketPathTask(self.tname).output().path
        data['q pocket'] = LpcPocketPathTask(self.qname).output().path
        data['Apoc result'] = LpcApocResultTask(self.tname, self.qname, self.subset).output().path
        data['Kcombu result'] = LpcKcombuResult(self.tname, self.qname, self.subset).output().path

        kcombu_data = self._kcombu_results().data
        data['Kcombu tanimoto'] = kcombu_data.tanimoto

        t_coords, q_coords = self._select_ligand_atom_coords()

        global_alignment = apoc_parser.queryGlobal(self.tname, self.qname)
        data['seq identity'] = global_alignment.seq_identity

        pocket_alignment = apoc_parser.queryPocket(self.tname, self.qname)
        if pocket_alignment.has_pocket_alignment:
            t_prt_coords, t_prt_names = self._select_residues(self.tname,
                                                              pocket_alignment.template_chainid,
                                                              pocket_alignment.template_res)

            q_prt_coords, q_prt_names = self._select_residues(self.qname,
                                                              pocket_alignment.query_chainid,
                                                              pocket_alignment.query_res)

            try:
                assert(t_prt_names == q_prt_names)
            except AssertionError:
                raise AssertionError("%s and %s protein residues do not match" % (self.tname, self.qname))

            t_contact = buildArrayOfContact(t_prt_coords, t_coords)
            q_contact = buildArrayOfContact(q_prt_coords, q_coords)
            cms = matthews_corrcoef(t_contact, q_contact)

            data['# residues'] = len(pocket_alignment.template_res)
            data['# ligand atoms'] = len(t_coords)
            data['Apoc ps-score'] = pocket_alignment.ps_score
            data['Apoc p-value'] = pocket_alignment.p_value
            data['Apoc z-score'] = pocket_alignment.z_score
            data['# residue atoms'] = len(t_prt_coords)
            data['t contact'] = t_contact
            data['q contact'] = q_contact
            data['xcms'] = cms

        to_write = json.dumps(data, sort_keys=True, indent=4, separators=(',', ': '))
        f.write(to_write)
        f.close()

        print "xcms output %s" % (self.output().path)
Example #15
def svc_amino(X, y, score_type):
    """

    :param X:
    :param y:
    :param score_type:
    """
    

    if (score_type=="split"):
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

        C = 70  # SVM regularization parameter
        rbf_svc = svm.SVC(kernel='rbf', gamma=0.07, C=C)
        rbf_svc.fit(X_train, y_train)
        y_score = np.array(rbf_svc.predict(X_test))
        y_test = np.array(y_test)
        tn = 0.0
        fp = 0.0

        for i in range(y_score.shape[0]):
            if y_test[i]==-1:
                if y_score[i]==-1:
                    tn = tn+1
                else: fp = fp+1
        spec = tn/(tn+fp)
        print "sensitivity:"
        print recall_score(y_test,y_score)
        print "specificity:"
        print spec
        print "accuracy:"
        print accuracy_score(y_test,y_score)
        print "MCC:"
        print matthews_corrcoef(y_test,y_score)

        


        return "ciao"
        # with binary it gives 0.78, with pssm it gives 0.80
    else:
        if(score_type=="cross"):
            # same RBF SVM as in the split branch, used here for cross-validation
            rbf_svc = svm.SVC(kernel='rbf', gamma=0.07, C=70)
            scores = cross_validation.cross_val_score(rbf_svc, X, np.array(y), cv=5)
            return scores
Example #16
def consistent_bfs(adjacency, edge_signs, root):
    """Return a set of edges forming a tree rooted at `root` in which all
    internal path have no more than one negative edges. Also compute its score
    based on internal edges not part of the tree and outgoing edges."""
    tree = set()
    q = deque()
    discovered = {k: (False, 0, 0) for k in adjacency.keys()}
    q.append(root)
    discovered[root] = (True, 0, 0)
    tree_nodes = set()
    nb_iter = 0
    total_path_length, nb_paths = 0, 0
    while q and nb_iter < len(discovered):
        nb_iter += 1
        v = q.popleft()
        tree_nodes.add(v)
        negativity = discovered[v][1]
        dst_from_root = discovered[v][2]
        for w in adjacency[v]:
            if not discovered[w][0]:
                e = (v, w) if v < w else (w, v)
                sign = edge_signs[e]
                w_negativity = negativity + {False: 1, True: 0}[sign]
                discovered[w] = (True, w_negativity, dst_from_root+1)
                if w_negativity <= 1:
                    q.append(w)
                    tree.add(e)
                else:
                    total_path_length += dst_from_root
                    nb_paths += 1
    within_tree, outside_edges, one_neg_edges = 0, 0, 0
    gold, pred = [], []
    for node in tree_nodes:
        negativity = discovered[node][1]
        for endpoint in adjacency[node]:
            if endpoint < node:
                continue
            e = (node, endpoint)
            if endpoint in tree_nodes:
                if e not in tree:
                    within_tree += 1
                    number_of_neg_edges = discovered[endpoint][1] + negativity
                    bad = number_of_neg_edges > 1
                    one_neg_edges += 1 - int(bad)
                    if not bad:
                        pred.append(1-number_of_neg_edges)
                        gold.append(int(edge_signs[e]))
            else:
                outside_edges += 1
    matthews_score = -1
    if len(gold) > 0:
        matthews_score = matthews_corrcoef(gold, pred)
    if within_tree == 0 or nb_paths == 0:
        return (root, -5, -5, -5, -5)
    return (root, outside_edges/len(tree_nodes), one_neg_edges/within_tree,
            matthews_score, total_path_length/nb_paths)
def f1_score(preds, true_labels):
    mcc = matthews_corrcoef(preds, true_labels)
    print "mcc ", mcc
    kappa = cohen_kappa_score(preds, true_labels)
    print "kappa ", kappa
    p = precision(preds, true_labels)
    print "precision ", p
    r = recall(preds, true_labels)
    print "recall", r
    return 2*p*r/(p+r)
Example #18
File: treestar.py Project: daureg/magnet
def full_treestar(G, E, k):
    root = max(G.items(), key=lambda x: len(x[1]))[0]
    start = clock()
    (gold, pred), m = treestar(G, E, k, root)
    end = clock() - start
    C = confusion_matrix(gold, pred)
    fp, tn = C[0, 1], C[0, 0]
    return [accuracy_score(gold, pred),
            f1_score(gold, pred, average='weighted', pos_label=None),
            matthews_corrcoef(gold, pred), fp/(fp+tn), end, 1-len(pred)/m]
Example #19
def compute_prediction_galaxy(k, edge_signs, seed=None):
    global NUM_TRAIN_EDGES
    basename = BASENAME+'_{}_{}'.format(seed, k)
    spanner_edges, _, _, _ = pot.read_spanner_from_file(basename)
    train_edges = {(u, v) for u, v in spanner_edges}
    NUM_TRAIN_EDGES = len(train_edges)
    gold, pred, brute_pred = pot.predict_edges(basename,
                                               all_signs=edge_signs,
                                               use_brute=False)
    return (accuracy_score(gold, pred), f1_score(gold, pred),
            matthews_corrcoef(gold, pred))
Example #20
    def solve_for_pq(x0, method='L-BFGS-B', bounds=bounds):
        sstart = clock()
        res = minimize(cost_and_grad, x0, jac=True, bounds=bounds,
                       method=method, options=dict(maxiter=1500))
        x = res.x
        p, q, y = x[:n], x[n:n*2], x[2*n:]
        feats = p[ppf]+q[qpf]
        pred = feats[test_set] > -find_threshold(-feats[train_set], ya[train_set])
        time_elapsed = clock() - sstart
        mcc = matthews_corrcoef(gold, pred)

        sorted_y = np.sort(y[test_set])
        frac = 1-ya[train_set].sum()/train_set.size
        pred_y_frac = y[test_set] > sorted_y[int(frac*sorted_y.size)]
        mcc_y_frac = matthews_corrcoef(gold, pred_y_frac)

        pred_y_fixed = y[test_set] > 0.5
        mcc_y_fixed = matthews_corrcoef(gold, pred_y_fixed)
        cost = res.fun
        return mcc, cost, mcc_y_fixed, mcc_y_frac, time_elapsed, x
Example #21
def rbf_analysis(X, Y, c, g, title, filename):

	print "Performing Cross Validation on Penalty: {}".format(c)
	dataLength = len(X)
	loo = LeaveOneOut(dataLength)
	predictions = []
	expected = []
	TP, FN, TN, FP = 0, 0, 0, 0
	Accuracy = 0
	for train_index, test_index in loo:
		X_train, X_test = X[train_index], X[test_index]
		Y_train, Y_test = Y[train_index], Y[test_index][0]

		clf = SVC(C=c, gamma=g, kernel='rbf')
		clf.fit(X_train, Y_train)
		prediction = clf.predict(X_test)[0]
	
		predictions.append(prediction)
		expected.append(Y_test)

	print("Calculating.....")
	for i, prediction in enumerate(predictions):
		if(prediction == 1 and expected[i] == 1):
			TP += 1
		elif(prediction == 0 and expected[i] == 1):
			FN += 1
		elif(prediction == 0 and expected[i] == 0):
			TN += 1
		elif(prediction == 1 and expected[i] == 0):
			FP += 1
		else:
			pass

	Sensitivity = TP/float(TP + FN)
	Specificity = TN/float(TN + FP)
	Accuracy = (TP + TN)/float(TP + TN + FP + FN)

	# Saving data to file
	with open(filename, 'a') as f:
		f.write("Sensitivity of Prediction: {} @ Penalty: {} @ Gamma: {}\n".format(Sensitivity, c, g))
		f.write("Specificity of Prediction: {} @ Penalty: {} @ Gamma: {}\n".format(Specificity, c, g))
		f.write("Accuracy of Prediction: {} @ Penalty: {} @ Gamma: {}\n".format(Accuracy, c, g))
		f.write("Matthews Correlation Coefficient Value: {}\n".format(matthews_corrcoef(predictions, expected)))
		f.write("Classification Report:\n")
		f.write(classification_report(predictions, expected))
		f.write("Confusion Matrix\n")
		cm = confusion_matrix(predictions, expected)
		f.write(str(cm))
		cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
		label1 = "Negative"
		label2 = "Positive"
			
		plt.figure()
		plot_confusion_matrix(cm, title, label1, label2)
Example #22
    def test(y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred)
        assert_array_equal(cm, [[22, 3], [8, 17]])

        tp, fp, fn, tn = cm.flatten()
        num = (tp * tn - fp * fn)
        den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

        true_mcc = 0 if den == 0 else num / den
        mcc = matthews_corrcoef(y_true, y_pred)
        assert_array_almost_equal(mcc, true_mcc, decimal=2)
        assert_array_almost_equal(mcc, 0.57, decimal=2)
def predict(clf):
    X, Y = build_training_and_test_data()
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

    clf.fit(X_train, y_train)

    y_true, y_pred = y_test, clf.predict(X_test)
    print("matthews correlation co-efficient: {0}".format(matthews_corrcoef(y_true, y_pred)))
    print(classification_report(y_true, y_pred))
    plt.figure()
    plot_confusion_matrix(confusion_matrix(y_test, y_pred), ["rejected", "posted"], title="decision tree")
    plt.show()
Example #24
File: evaluate.py Project: arose/deepchem
  def compute_model_performance(self, csv_out, stats_file):
    """
    Computes statistics of model on test data and saves results to csv.
    """
    pred_y_df = self.model.predict(self.dataset)
    log("Saving predictions to %s" % csv_out, self.verbose)
    pred_y_df.to_csv(csv_out)

    if self.task_type == "classification":
      colnames = ["task_name", "roc_auc_score", "matthews_corrcoef",
                  "recall_score", "accuracy_score"]
    elif self.task_type == "regression":
      colnames = ["task_name", "r2_score", "rms_error"]
    else:
      raise ValueError("Unrecognized task type: %s" % self.task_type)

    performance_df = pd.DataFrame(columns=colnames)
    y_means = next(pred_y_df.iterrows())[1]["y_means"]
    y_stds = next(pred_y_df.iterrows())[1]["y_stds"]

    for i, task_name in enumerate(self.task_names):
      y = pred_y_df[task_name].values
      y_pred = pred_y_df["%s_pred" % task_name].values
      w = pred_y_df["%s_weight" % task_name].values
      y = undo_transform(y, y_means, y_stds, self.output_transforms)
      y_pred = undo_transform(y_pred, y_means, y_stds, self.output_transforms)

      if self.task_type == "classification":
        y, y_pred = y[w.nonzero()].astype(int), y_pred[w.nonzero()].astype(int)
        # Sometimes all samples have zero weight. In this case, continue.
        if not len(y):
          continue
        auc = compute_roc_auc_scores(y, y_pred)
        mcc = matthews_corrcoef(y, y_pred)
        recall = recall_score(y, y_pred)
        accuracy = accuracy_score(y, y_pred)
        performance_df.loc[i] = [task_name, auc, mcc, recall, accuracy]

      elif self.task_type == "regression":
        try:
          r2s = r2_score(y, y_pred)
          rms = np.sqrt(mean_squared_error(y, y_pred))
        except ValueError:
          r2s = np.nan
          rms = np.nan
        performance_df.loc[i] = [task_name, r2s, rms]

    log("Saving model performance scores to %s" % stats_file, self.verbose)
    performance_df.to_csv(stats_file)

    return pred_y_df, performance_df
Example #25
 def compute_mcc(self, x, y, labels):
   for column in x.T:
     pred_lab = column.todense().reshape([len(y), 1])
     pred_lab[np.where(pred_lab > 0)] = 1
     mcc_word = []
     for label in labels:
       true_lab = np.empty([len(y), 1])
       true_lab.fill(0)
       true_lab[np.where(y == label)] = 1
       mcc = matthews_corrcoef(true_lab, pred_lab)
       mcc_word.append(mcc.item())
     self.mcc_all_words.append(mcc_word)
   self.mcc_all_words = np.array(self.mcc_all_words, dtype=np.float32)
   self.mcc_all_words[np.where(self.mcc_all_words<0)] = 0
Example #26
    def forward(self, task=None, input1=None, input2=None, label=None):
        '''
        Predict through model and task-specific prediction layer

        Args:
            - inputs (tuple(TODO))
            - pred_layer (nn.Module)
            - pair_input (int)

        Returns:
            - logits (TODO)
        '''
        pair_input = task.pair_input
        pred_layer = getattr(self, '%s_pred_layer' % task.name)
        if pair_input:
            if self.pair_enc_type == 'bow':
                sent1 = self.sent_encoder(input1)
                sent2 = self.sent_encoder(input2) # causes a bug with BiDAF
                logits = pred_layer(torch.cat([sent1, sent2, torch.abs(sent1 - sent2),
                                               sent1 * sent2], 1))
            else:
                pair_emb = self.pair_encoder(input1, input2)
                logits = pred_layer(pair_emb)

        else:
            sent_emb = self.sent_encoder(input1)
            logits = pred_layer(sent_emb)
        out = {'logits': logits}
        if label is not None:
            if isinstance(task, (STS14Task, STSBTask)):
                loss = F.mse_loss(logits, label)
                label = label.squeeze(-1).data.cpu().numpy()
                logits = logits.squeeze(-1).data.cpu().numpy()
                task.scorer1(pearsonr(logits, label)[0])
                task.scorer2(spearmanr(logits, label)[0])
            elif isinstance(task, CoLATask):
                label = label.squeeze(-1)
                loss = F.cross_entropy(logits, label)
                task.scorer2(logits, label)
                label = label.data.cpu().numpy()
                _, preds = logits.max(dim=1)
                task.scorer1(matthews_corrcoef(label, preds.data.cpu().numpy()))
            else:
                label = label.squeeze(-1)
                loss = F.cross_entropy(logits, label)
                task.scorer1(logits, label)
                if task.scorer2 is not None:
                    task.scorer2(logits, label)
            out['loss'] = loss
        return out
Example #27
def evaluate(gold_file, pred_file, metrics=['acc'], skip_gold=1, skip_pred=1, gold_map=None):
    golds = []
    preds = []
    with open(gold_file) as gold_fh:
        for _ in range(skip_gold):
            gold_fh.readline()
        for row in gold_fh:
            targ = row.strip().split('\t')[-1]
            try:
                targ = int(targ)
            except:
                pass
            '''
            try:
                targ = float(targ)
            except:
                pass
            '''
            if gold_map is not None:
                targ = gold_map[targ]
            golds.append(targ)

    with open(pred_file) as pred_fh:
        for _ in range(skip_pred):
            pred_fh.readline()
        for row in pred_fh:
            targ = row.strip().split('\t')[-1]
            try:
                targ = int(targ)
            except:
                pass
            preds.append(targ)

    assert len(golds) == len(preds)
    n_exs = len(golds)
    if 'acc' in metrics:
        acc = sum([1 for gold, pred in zip(golds, preds) if gold == pred]) / float(len(golds))
        print("acc: %.3f" % acc)
    if 'f1' in metrics:
        f1 = f1_score(golds, preds)
        print("f1: %.3f" % f1)
    if 'matthews' in metrics:
        mcc = matthews_corrcoef(golds, preds)
        print("mcc: %.3f" % mcc)
    if "corr" in metrics:
        corr = pearsonr(golds, preds)[0]
        print("pearson r: %.3f" % corr)
        corr = spearmanr(golds, preds)[0]
        print("spearman r: %.3f" % corr)
Example #28
def get_scores(scores,y,label=None, verbose=True):
    '''
    Returns a dictionary of metrics for a given classification of the data (given by Cross_val_predict).
    scores: list
        Classifier predictions on data
    y: list
        True Class labels
    label: string
        Name of the classifier used
    '''
    results_dict = {}
    try:
        roc_auc_no_avg = metrics.roc_auc_score(y, scores,average=None)
        if verbose: print("roc_auc for each class: %0.4f " % (roc_auc_no_avg))
        results_dict['ROC_AUC not averaged'] = roc_auc_no_avg
    
        roc_auc_weighted = metrics.roc_auc_score(y, scores,average='weighted')
        if verbose: print("roc_auc (weighted-Av): %0.4f " % (roc_auc_weighted))
        results_dict['ROC_AUC weighted'] = roc_auc_weighted

    except ValueError as e:
        print(e)

    f1_pos = metrics.f1_score(y, scores,average='binary')
    if verbose: print("POS f1: %0.4f  " % (f1_pos))
    results_dict['F1'] = f1_pos
    
    av_PR = metrics.average_precision_score(y, scores) # corresponds to the area under the precision-recall curve
    if verbose: print("Av_precision (Prec-Recall AUC): %0.3f " % (av_PR))
    results_dict['Averaged precision'] = av_PR
    
    accuracy = metrics.accuracy_score(y, scores)
    if verbose: print("Accuracy: %0.3f " % (accuracy))
    results_dict['Accuracy'] = accuracy
    
    precision,recall,fscore,support = metrics.precision_recall_fscore_support(y, scores,average='binary')
    if verbose: print("Precision: %0.3f " % (precision))
    results_dict['Precision'] = precision
    
    if verbose: print("Recall: %0.3f " % (recall))
    results_dict['Recall'] = recall
    
    # if verbose: print("fscore(fBeta): %0.4f  [%s]" % (fscore, label))
    mcc = metrics.matthews_corrcoef(y, scores)
    if verbose: print("MCC: %0.3f " % (mcc))
    results_dict['Matthews correlation coefficient'] = mcc

    results_dict = {k:round(float(v),4) for k, v in results_dict.items()}
    return results_dict
Example #29
def get_error(est_track, true_track):
    """
    """
    
    if est_track.ndim > 1:
        true_track = true_track.reshape((true_track.shape[0],1))
    
    error = np.recarray(shape=est_track.shape,
                        dtype=[('position', float),
                               ('orientation', float),
                               ('orientation_weighted', float)])
    
    # Position error
    pos_err = (true_track.x - est_track.x)**2 + (true_track.y - est_track.y)**2
    error.position = np.sqrt(pos_err)
    
    # Orientation error
    error.orientation = anglediff(true_track.angle, est_track.angle, units='deg')    
    error.orientation_weighted = anglediff(true_track.angle, est_track.angle_w, units='deg')
    
    descr = {}
    bix = np.logical_not(np.isnan(error.orientation))
    descr['orientation_median'] = np.median(np.abs(error.orientation[bix]))
    descr['orientation_mean'] = np.mean(np.abs(error.orientation[bix]))
    bix = np.logical_not(np.isnan(error.orientation_weighted))
    descr['orientation_weighted_median'] = np.nanmedian(np.abs(error.orientation_weighted[bix]))
    descr['orientation_weighted_mean'] = np.nanmean(np.abs(error.orientation_weighted[bix]))
    # no angle
    true_no_angle = np.isnan(true_track.angle)
    est_no_angle = np.isnan(est_track.angle)
    agree = np.logical_and(true_no_angle, est_no_angle)
    disagree = np.logical_xor(true_no_angle, est_no_angle)
    both = np.logical_or(true_no_angle, est_no_angle)
    #ipdb.set_trace()
    descr['no_angle_auc'] = roc_auc_score(true_no_angle, est_no_angle)
    descr['no_angle_mcc'] = matthews_corrcoef(true_no_angle, est_no_angle)
    descr['no_angle_brier'] = brier_score_loss(true_no_angle, est_no_angle)    
    descr['no_angle_acc'] = agree.sum()/both.sum()
    descr['no_angle_p_per_frame'] = disagree.sum()/disagree.shape[0]
    descr['position_median'] = np.median(error.position)
    descr['position_mean'] = np.mean(error.position)
    
    #print('True frequency of angle-does-not-apply:',
     #     true_no_angle.sum()/true_no_angle.shape[0])
    
    #print('Estimated frequency of angle-does-not-apply:',
     #     est_no_angle.sum()/est_no_angle.shape[0])    

    return error, descr
Example #30
def evaluate(labels, predictions, label_names, min_label):
    if len(label_names) > 2:
        labels = labels.ravel()
        predictions = predictions.ravel()
        precision = metrics.precision_score(labels, predictions)
        recall = metrics.recall_score(labels, predictions)
        mcc = metrics.matthews_corrcoef(labels, predictions)
        f_score = metrics.f1_score(labels, predictions)
        accuracy = metrics.accuracy_score(labels, predictions)
    else:
        precision = metrics.precision_score(labels, predictions,
                                            pos_label=min_label)
        recall = metrics.recall_score(labels, predictions,
                                      pos_label=min_label)
        f_score = metrics.f1_score(labels, predictions, pos_label=min_label)
        mcc = metrics.matthews_corrcoef(labels, predictions)
        accuracy = metrics.accuracy_score(labels, predictions)

    print("Accuracy :", "{: 0.3f}".format(accuracy))
    print("Precision:", "{: 0.3f}".format(precision))
    print("Recall   :", "{: 0.3f}".format(recall))
    print("F-Score  :", "{: 0.3f}".format(f_score))
    print("MCC-Score:", "{: 0.3f}".format(mcc))
    return accuracy, precision, recall, f_score, mcc
Example #31
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_curve
from matplotlib import pyplot
print(confusion_matrix(ytest, y_predict))
print(classification_report(ytest, y_predict))
print(accuracy_score(ytest, y_predict))
print(balanced_accuracy_score(ytest, y_predict))
print(metrics.precision_score(ytest, y_predict))
print(metrics.recall_score(ytest, y_predict))
print(metrics.f1_score(ytest, y_predict))
print(matthews_corrcoef(ytest, y_predict))
print(roc_auc_score(ytest, y_predict))
print(roc_auc_score(ytest, y_predict_vgg ))
print(roc_auc_score(ytest, y_predict))
lr_fpr, lr_tpr, _ = roc_curve(ytest, y_predict_pro)
lr_fpr_vgg, lr_tpr_vgg, _ = roc_curve(ytest, y_predict_vgg )
lr_fpr_svm, lr_tpr_svm, _ = roc_curve(ytest, y_predict_svm)
pyplot.plot(lr_fpr, lr_tpr, marker='x', label='Logistic')
pyplot.plot(lr_fpr_vgg, lr_tpr_vgg, marker='o', label='vgg')
pyplot.plot(lr_fpr_svm, lr_tpr_svm, marker='v', label='svm kernel=rbf')
pyplot.xlabel('False Positive Rate',{'size': 14})
pyplot.ylabel('True Positive Rate',{'size': 14})
# show the legend
pyplot.legend()
pyplot.tight_layout()
pyplot.savefig('./split_roc.png')
Example #32
from sklearn.metrics import (precision_score, precision_recall_fscore_support,
                             accuracy_score, matthews_corrcoef)

#precision alone
precision = precision_score(y_test, y_pred, average='macro')
print(precision)

#precision, recall and f1 score
prfs = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('Precision score=', prfs[0] * 100)
print('Recall score=', prfs[1] * 100)
print('F1 score=', prfs[2] * 100)

#accuracy
acc = accuracy_score(y_test, y_pred)
print(acc)

#MCC
mat = matthews_corrcoef(y_test, y_pred)
print(mat)

from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
import numpy as np

crossValScoreAccuracy = cross_val_score(classifier, X, Y, cv=10)
print("Mean Accuracy = ", np.mean(crossValScoreAccuracy))

precisionScorer = make_scorer(precision_score, pos_label='True')
crossValScorePrecision = cross_val_score(classifier,
                                         X,
                                         Y,
                                         cv=10,
Example #33
accuracy_score = accuracy_score(true_labels, new_labels)
print("\n\nAccuracy {} %".format(round(accuracy_score * 100, 3)))

confusion_matrix = confusion_matrix(true_labels, new_labels)
print("\n\nConfusion Matrix: \n\n {}".format(confusion_matrix))

classification_report = classification_report(true_labels, new_labels)
print("\n\nClassification Scores: \n\n {}".format(classification_report))

hamming_loss = hamming_loss(true_labels, new_labels)
print("\n\nHamming Loss {}".format(hamming_loss))

jaccard_similarity_score = jaccard_similarity_score(true_labels, new_labels)
print("\n\nJaccard Similarity Score {}".format(jaccard_similarity_score))

matthews_corrcoef = matthews_corrcoef(true_labels, new_labels)
print("\n\nMatthews corrcoef {}".format(matthews_corrcoef))

zero_one_loss = zero_one_loss(true_labels, new_labels)
print("\n\nZero-One Loss {}".format(zero_one_loss))

##################################################
##### OUTPUT
##################################################

# Clustered in 93.176 seconds

# Cluster 0 labels:
# attack.    228278
# normal.      5770
# dtype: int64
Example #34
def evalmcc(preds, dtrain):
    labels = dtrain.get_label()
    return 'MCC', matthews_corrcoef(labels, preds > THRESHOLD)
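evalmcc follows the (name, value) convention xgboost expects from a custom evaluation function (passed to xgboost.train as feval in older releases, custom_metric in newer ones). The sketch below calls it directly on random data; THRESHOLD is assumed to be a module-level constant whose real value is not shown in this snippet.

import numpy as np
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef

THRESHOLD = 0.5  # assumed value; the original constant is not shown

labels = np.random.randint(0, 2, size=100)
preds = np.random.rand(100)
dtrain = xgb.DMatrix(np.random.rand(100, 3), label=labels)
print(evalmcc(preds, dtrain))  # -> ('MCC', <score at the 0.5 cutoff>)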
Example #35
    def regression_cv(self, cv_path='tie_strengths/cv_config.yaml'):
        """
        Performs CV at different levels of overlap
        """
        try:
            conf = yaml.load(open(cv_path))
        except:
            self.paths['cv_path'] = os.path.join(self.run_path,
                                                 'cv_config.yaml')
            conf = yaml.load(open(self.paths['cv_path']))
        params = self.get_variable_transformations(conf['params'])
        cols_pttrns = params.keys()

        try:  #TODO: change this (for db)
            self.paths['full_df']
        except:
            self.paths['full_df'] = os.path.join(self.run_path, 'full_df.txt')

        df = pd.read_table(self.paths['full_df'], sep=' ')
        df = df[df.c_wkn_t > 2]

        print('Table Read \n')
        cols_dic = self.get_cols_dic(cols_pttrns,
                                     df.columns)  # link cols with patterns

        # TODO: add this to a diff function, it's different preprocessing
        pttrn = '_wk(n|l)_(\d+|t|l)'
        df_nas = {col: 0. for col in df.columns if re_search(pttrn, col)}

        df = df.fillna(value=df_nas)
        print('NAs filled\n')
        wkn_cols = [
            n for n, col in enumerate(df.columns)
            if re_search('c_wkn_\d+', col)
        ]
        wkl_cols = [
            n for n, col in enumerate(df.columns)
            if re_search('c_wkl_\d+', col)
        ]
        wks_cols = [
            n for n, col in enumerate(df.columns)
            if re_search('s_wkn_\d+', col)
        ]

        # TODO: check if its faster to apply diff function
        df.loc[:, 'prop_len'] = get_prop_len(df['c_wkl_l'], df['deg_0'],
                                             df['deg_1'], df['n_len_0'],
                                             df['n_len_1'])

        #df.loc[:, 'c_l_dist'] = df.apply(lambda x: np.dot(x[wkn_cols], x[wkl_cols]), axis=1)
        print('First Variable\n')
        del df['c_wkn_0']
        del df['c_wkl_0']
        #del df['s_wkn_0']
        try:
            del df['0']
        except:
            pass
        del df['1']
        del df['n_ij']
        del df['deg_0']
        del df['deg_1']
        try:
            del df['0_1']
        except:
            pass
        try:
            del df['1_1']
        except:
            pass
        try:
            del df['0_0']
        except:
            pass

        df.dropna(inplace=True)
        self.paths['cv_class_stats'] = os.path.join(self.run_path,
                                                    'cv_class_det0_stats.csv')
        w = open(self.paths['cv_class_stats'], 'w')
        w.write(' '.join([
            'alpha', 'num_1', 'num_1_pred', 'accuracy', 'f1', 'matthews',
            'precision', 'recall'
        ]) + '\n')
        w.close()
        y = df['ovrl']
        del df['ovrl']
        print("Obtaining models\n")
        alphas = [0.0, 0.001, 0.002, 0.004, 0.005, 0.01, 0.015] + list(
            np.arange(0.02, 0.1, .01)) + list(np.arange(0.1, .5, .05)) + list(
                np.arange(.5, .9, 0.1)) + list(np.arange(.09, 1, .01))
        for alpha in alphas:
            y_c = y.apply(lambda x: self._ifelse(x <= alpha, 1, 0))
            x_train, x_test, y_train, y_test = train_test_split(df,
                                                                y_c,
                                                                test_size=0.5)
            rf = RandomForestClassifier()
            rf.fit(x_train, y_train)
            y_pred = rf.predict(x_test)
            ac = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            mth = matthews_corrcoef(y_test, y_pred)
            prc = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            self.write_class_results(alpha, sum(y_c), sum(y_pred), ac, f1, mth,
                                     prc, rec)
            print(str(alpha) + '\n')
Example #36
File: plot.py Project: keukentrap/thesis
def plot(y_true,y_pred,y_proba,plot_title):
    #classes = ['China', 'Russia', 'North-Korea', 'USA', 'Pakistan'] 
    #classes = ["APT-{}".format(i+1) for i in np.unique(y_true)]
    y_uniq = np.unique(y_true)
    classes = ["Country {}".format(i) for i in y_uniq]

    # CORRECT ONE
    # classes = ["APT 1", "APT 10", "APT 19", "APT 21", "APT 28", "APT 29", "APT 30", "DarkHotel", "Energetic Bear", "Equation Group", "Gorgon Group", "Winnti"]
    
    # WRONG ONE
    # classes = ["APT 1", "APT 10", "APT 19", "APT 21", "APT 30", "Winnti", "APT 28", "APT 29", "Energetic Bear", "DarkHotel", "Equation Group", "Gorgon Group"]
    
    # classes = ["Country 0,1 & 2", "Country 4", "Country 3"]
    #classes = ['Asia','Pakistan','USA']
    #classes = ['China', 'North-Korea']
    location = get_plot_directory(plot_title)

    # cnf_matrix = confusion_matrix(y_true, pred,labels=range(5))
    np.set_printoptions(precision=2)

    # BROKEN
    # norm = skplt.metrics.plot_confusion_matrix(y_true,y_pred,
    #                             #    classes=classes,
    #                                normalize=True,
    #                                title = plot_title + " (normalized)")
    # norm.set_ylim(len(classes)-0.5, -0.5)
    # plt.savefig(location + "normalized.pdf")
    norm = plot_confusion_matrix(y_true,y_pred, 
                                 normalize=True, 
                                 classes=classes, 
                                 title=plot_title + " (normalized)")
    
    norm.savefig(location + "normalized.pdf")
    plt.clf()

    
    # BROKEN
    # skplt.metrics.plot_confusion_matrix(y_true,y_pred,
    #                             #    classes=classes,
    #                                normalize=False,
    #                                title = plot_title + " (normalized)")
    # plt.savefig(location + "normalized.pdf")

    not_norm = plot_confusion_matrix(y_true,y_pred, 
                                     normalize=False, 
                                     classes=classes, 
                                     title=plot_title)

    not_norm.savefig(location + "not_normalized.pdf")

    #print(np.round(pred))
    acc = accuracy_score(y_true,np.round(y_pred))
    print(acc)

    print(classification_report(y_true, y_pred, digits=3))

    (TP, FP, TN, FN) = perf_measure(y_true, y_pred)
    precision, recall, average_precision = calculate_precision_recall(y_true, y_pred)
    print("recall ", recall)
    print("precision ", precision)

    kappa = cohen_kappa_score(y_true, y_pred)
    print("Cohen's Kappa statistic ", kappa)


    matthew = matthews_corrcoef(y_true, y_pred)
    print("Matthews correlation coefficient ", matthew)

    # Multi-class ROC
    #pred = np.eye(len(classes))[y_pred]
    skplt.metrics.plot_roc(y_true,y_proba,plot_micro=False,plot_macro=True,figsize=None)
    plt.savefig(location + "roc.pdf")

    skplt.metrics.plot_precision_recall(y_true,y_proba)
    plt.savefig(location + "precision_recall_curve.pdf")

    # Binary ROC
    # fpr, tpr, threshold = roc_curve(y_true,y_pred)
    # roc_auc = auc(fpr,tpr)
    # plt.clf()
    # plt.title('Receiver Operating Characteristic')
    # plt.plot(fpr,tpr,'b', label = 'AUC = {0:.2f}'.format(roc_auc))
    # plt.legend(loc = 'lower right')
    # plt.plot([0,1], [0,1], 'r--')
    # plt.xlim([0,1])
    # plt.ylim([0,1])
    # plt.xlabel('True Positive Rate')
    # plt.ylabel('False Positive Rate')
    # plt.savefig(location + "roc.pdf")
    #plt.show()

    with open('../saved/history.pkl', 'rb') as f:
            history = pickle.load(f)
            # print(history.keys())
            plot_metrics(history, location)

    with open(location + "scores.txt", 'w') as f:
        f.write("prediction dataset size: {}\n".format(len(y_true)))
        f.write("accuracy: {}\n".format(acc))
        f.write("recall: {}\n".format(recall))
        f.write("precision: {}\n".format(precision))
        f.write("kappa: {}\n".format(kappa))
        f.write("matthew corcoef: {}\n".format(matthew))
        # f.write("ROC\ntpr: {}\nfpr: {}\nAUC: {}".format(tpr,fpr,roc_auc))
        f.write(classification_report(y_true, y_pred, digits=3))
Example #37
def main():
    df = pd.read_excel("..\products_allshops_dataset.xlsx",
                       names=['produkt', 'kategoria'])
    np.random.seed(5)
    df = df.reindex(np.random.permutation(df.index))
    labels = np.unique(df['kategoria'])

    df_train = df[3000:].copy()
    df_valid = df[1500:3000].copy()
    df_test = df[0:1500].copy()
    X = df_train['produkt']
    y = df_train['kategoria']
    X_valid = df_valid['produkt']
    y_valid = df_valid['kategoria']
    X_test = df_test['produkt']
    y_test = df_test['kategoria']

    stop_words_file = '..\polish_stopwords.txt'
    # Getting the list of polish stop words
    with open(stop_words_file, mode='r') as stop_words:
        stop_words_list = stop_words.read().split('\n')

    # Pipeline for feature vectorizing with the GridsearchCV results included
    vect_pipeline = Pipeline([
        ('vect',
         TfidfVectorizer(max_df=0.1,
                         ngram_range=(1, 2),
                         stop_words=stop_words_list,
                         sublinear_tf=True)),
        ('tfidf',
         TfidfTransformer(norm='l2',
                          smooth_idf=True,
                          sublinear_tf=False,
                          use_idf=True)),
    ])
    # Pipeline for Naive Bayes model with the GridsearchCV results included
    mnb_pipeline = Pipeline([
        ('vectpip', vect_pipeline),
        ('clf', MultinomialNB(alpha=0.5, fit_prior=False)),
    ])

    # Fit Naive Bayes model according to train data
    mnb_pipeline.fit(X, y)

    # Get the predicted codes for training set, validation and test set
    predicted_train = mnb_pipeline.predict(X)
    predicted_test = mnb_pipeline.predict(X_test)
    predicted_valid = mnb_pipeline.predict(X_valid)

    # Compare train, test and validation accuracy in case of overfitting
    accuracy_train = accuracy_score(y, predicted_train)
    accuracy_test = accuracy_score(y_test, predicted_test)
    accuracy_valid = accuracy_score(y_valid, predicted_valid)

    # Generate measures for each class of the classification for the training, validation and testing group
    report_train = classification_report(y, predicted_train, labels=labels)
    report_test = classification_report(y_test, predicted_test, labels=labels)
    report_valid = classification_report(y_valid,
                                         predicted_valid,
                                         labels=labels)
    """
    Check f1 score for training, validation and testing group.
    When true positive + false positive == 0 or true positive + false negative == 0,
    f-score returns 0 and raises UndefinedMetricWarning.
    """
    f1_score_micro_train = f1_score(y, predicted_train, average='micro')
    f1_score_micro_test = f1_score(y_test, predicted_test, average='micro')
    f1_score_micro_valid = f1_score(y_valid, predicted_valid, average='micro')

    # Count the Matthews correlation coefficient for the training, validation and testing group
    mcc_train = matthews_corrcoef(y, predicted_train)
    mcc_test = matthews_corrcoef(y_test, predicted_test)
    mcc_valid = matthews_corrcoef(y_valid, predicted_valid)

    # Let's summarize and print all the results for the Naive Bayes classifier according to the dataset sort
    print("\nGeneral metrics for")
    print("TRAINING DATA")
    print("=" * 30)
    print("accuracy = {}".format(accuracy_train))
    print("F1-score = {}".format(f1_score_micro_train))
    print("MCC = {}\n".format(mcc_train))
    print("*" * 10, "CLASSIFICATION REPORT", "*" * 10)
    print(report_train)
    print("=" * 30)
    print("General metrics for")
    print("VALIDATION DATA")
    print("=" * 30)
    print("accuracy = {}".format(accuracy_valid))
    print("F1-score = {}".format(f1_score_micro_valid))
    print("MCC = {}\n".format(mcc_valid))
    print("*" * 10, "CLASSIFICATION REPORT", "*" * 10)
    print(report_valid)
    print("=" * 30)
    print("General metrics for")
    print("TEST DATA")
    print("=" * 30)
    print("accuracy = {}".format(accuracy_test))
    print("F1-score = {}".format(f1_score_micro_test))
    print("MCC = {}\n".format(mcc_test))
    print("*" * 10, "CLASSIFICATION REPORT", "*" * 10)
    print(report_test)
    """
    The commented code below can be used to export the fitted model into joblib file.
    Next it can be loaded to other files or applications.
    """
    # Export the fitted classifier to the file that can be used in applications to classify products
    # and get the probabilities of predicted categories
    # from joblib import dump
    # dump(mnb_pipeline, 'naive_bayes.joblib')
    """
    Test data and their predicted values can be saved into the xlsx file.
    It is also possible to add new columns - for example the most probable category and its probability.
    """
    df_test['Autocode'] = predicted_test
    predicted_prob = mnb_pipeline.predict_proba(X_test)
    df_test['Probability'] = predicted_prob.max(axis=1)
    df_test.to_excel("nb_autocode.xlsx")
    """
    It is also possible to create confusion matrix for each data set with the use of Seaborn library.
    It shows how accurate does the classifier predicts the labels versus labels in the initial dataset (test or validation).
    The generated chart can be saved into the separate file.
    """
    # Drawing confusion matrix for the test and validation group
    cm_test = draw_cmatrix(y_test, predicted_test, labels, "test")
    cm_valid = draw_cmatrix(y_valid, predicted_valid, labels, "validation")
Example #38
def matt(y_test,pred):
    return matthews_corrcoef(y_test,pred) 
Example #39
 y = tr_y_b
 
 round_i = 0
 while True:
     # conditions to stop searching for models
     # 1. have at least one candidate model whose mcc is bigger than 0
     # 2. and either
     # 2.1. reach the max round number
     # 2.2. or achieve the target mcc value 
     if best_model_mcc!=0 and (round_i >= tr_max_round_nu or best_model_mcc>=tr_mcc_target):
         break
     print('tree',str(tree_id)+'.'+str(round_i),'-->training...',end='')
     tre = linear_model.SGDClassifier().fit(x,y)
    
     y_pred = tre.predict(x)
     mcc = matthews_corrcoef(y, y_pred) 
     print(mcc,end='')
     print('...val...',end='')
     y_pred = tre.predict(sgd_val_x)
     mcc = matthews_corrcoef(sgd_val_y, y_pred) 
     print(mcc,end='')
     
     if mcc > best_model_mcc:
         best_model = tre
         best_model_mcc = mcc
         print('(best)',end='')
     print()
     round_i +=1
 forest.append(best_model)
 tree_id +=1
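The fragment above depends on names defined earlier in its source file (x, tr_y_b, best_model_mcc, tr_max_round_nu, tr_mcc_target, forest). A self-contained sketch of the same idea, with assumed stand-in limits: keep refitting a classifier until the validation MCC hits a target or the round budget runs out, keeping the best model seen.

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=600, random_state=0)
x_tr, x_val, y_tr, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

max_rounds, mcc_target = 10, 0.8   # assumed stand-ins for tr_max_round_nu / tr_mcc_target
best_model, best_mcc = None, 0.0
for round_i in range(max_rounds):
    clf = SGDClassifier(random_state=round_i).fit(x_tr, y_tr)
    mcc = matthews_corrcoef(y_val, clf.predict(x_val))  # validation MCC decides what to keep
    if mcc > best_mcc:
        best_model, best_mcc = clf, mcc
    if best_mcc >= mcc_target:
        break
print('best validation MCC:', best_mcc)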
 
예제 #40
0
              callbacks=[check],
              validation_data=(x_test, y_test))

out = model.predict_proba(x_test)
out = np.array(out)

threshold = np.arange(0.001, 0.999, 0.001)

acc = []
accuracies = []
best_threshold = np.zeros(out.shape[1])
for i in range(out.shape[1]):
    y_prob = np.array(out[:, i])
    for j in threshold:
        y_pred = [1 if prob >= j else 0 for prob in y_prob]
        acc.append(matthews_corrcoef(y_test[:, i], y_pred))
    acc = np.array(acc)
    index = np.where(acc == acc.max())
    accuracies.append(acc.max())
    best_threshold[i] = threshold[index[0][0]]
    acc = []

    print("best thresholds", best_threshold)
    y_pred = np.array([[
        1 if out[i, j] >= best_threshold[j] else 0
        for j in range(y_test.shape[1])
    ] for i in range(len(y_test))])

    print("-" * 40)
    print("Matthews Correlation Coefficient")
    print("Class wise accuracies")
예제 #41
0
def do_eval(model, logger, args, device, tr_loss, nb_tr_steps, global_step,
            processor, label_list, tokenizer, eval_dataloader, task_id, i,
            epoch_num):

    model.eval()
    all_pred = []
    all_label = []

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        with torch.no_grad():
            tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask,
                                          i, task_id, label_ids)

        if task_id == 'cola':
            logits = logits.detach()
            label_ids_np = label_ids.to('cpu').numpy()
            _, preds = logits.max(dim=1)
            tmp_eval_accuracy = matthews_corrcoef(label_ids_np,
                                                  preds.data.cpu().numpy())

            _, predicted = torch.max(logits.cpu().data, 1)
            all_pred.extend(predicted)
            all_label.extend(label_ids.cpu())

        elif task_id == 'sts':
            logits = logits.detach()
            label_ids = label_ids.to('cpu').numpy()
            logits = logits.squeeze(-1).data.cpu().tolist()
            logits = [min(max(0., pred * 1.), 1.) for pred in logits]
            tmp_eval_accuracy = pearsonr(logits, label_ids)[0]
        else:
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        if task_id == 'cola' or task_id == 'sts':
            eval_accuracy += tmp_eval_accuracy * input_ids.size(0)
        else:
            eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(all_label, all_pred)

    result = {
        'Eval_loss': eval_loss,
        'Eval_accuracy': accuracy_score(all_label, all_pred),
        'Eval_f1': f1_score(all_label, all_pred, average='macro'),
        'Global_step': global_step,
        'Loss': tr_loss / nb_tr_steps
    }

    logger.info("******** TASK %s Eval Results ********", args.data_dirs[i])
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
    return eval_accuracy, result
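do_eval switches its metric by task in the usual GLUE fashion: Matthews correlation for CoLA, Pearson correlation for STS-B, accuracy otherwise. A minimal sketch of that dispatch (the helper name is illustrative, not from this repository):

from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, matthews_corrcoef

def task_score(task_id, labels, preds):
    # CoLA is conventionally scored with MCC, STS-B with Pearson correlation;
    # the remaining classification tasks fall back to accuracy here.
    if task_id == 'cola':
        return matthews_corrcoef(labels, preds)
    if task_id == 'sts':
        return pearsonr(preds, labels)[0]
    return accuracy_score(labels, preds)

print(task_score('cola', [1, 0, 1, 1], [1, 0, 0, 1]))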
예제 #42
0
    plot_confusion_matrix(cm, classes=class_names, title=plot_name,ax=ax4,
                          fontsize=font_size)
    fig4.savefig((sub_dir + 'Confusion Matrix.png'), dpi=fig4.dpi)
    plt.close()
    cm_stack = cm + cm_stack
    cm_stats[:, :, split] = cm
    
    # Get accuracy of each cm
    accuracy[split] = 100 * sum(np.diagonal(cm)) / sum(sum(cm))
    
    # Write to text file
    with open((sub_dir + 'Accuracy.txt'), "w") as output:
        output.write(str(accuracy[split]))
        
    #Compute Matthews correlation coefficient
    MCC[split] = matthews_corrcoef(test_dict['GT'],test_dict['Predictions'])
    
    # Write to text file
    with open((sub_dir + 'MCC.txt'), "w") as output:
        output.write(str(MCC[split]))

    print('**********Run ' + str(split+1) + ' Finished**********')


directory = os.path.dirname(os.path.dirname(sub_dir)) + '/'   
np.set_printoptions(precision=2)
fig5, ax5 = plt.subplots(figsize=(fig_size, fig_size))
plot_avg_confusion_matrix(cm_stats, classes=class_names, 
                          title=avg_plot_name,ax=ax5,fontsize=font_size)
fig5.savefig((directory + 'Average Confusion Matrix.png'), dpi=fig5.dpi)
plt.close()
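The accuracy computed above is just the trace of the confusion matrix over its total; an equivalent NumPy one-liner on a toy matrix:

import numpy as np

cm = np.array([[50, 2, 1],
               [4, 45, 3],
               [0, 5, 40]])                    # toy 3-class confusion matrix
accuracy = 100 * np.trace(cm) / cm.sum()       # same as 100 * sum(diagonal) / sum(sum(cm))
print(round(accuracy, 2))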
예제 #43
0
                    validation_data=[x_test, y_test],
                    callbacks=[tensorboard])

# Prediction and ROC/ AUC curve plotting
y_pred = model.predict(x_test)
fpr_keras, tpr_keras, thresholds_keras = roc_curve(np.ravel(y_test),
                                                   np.ravel(y_pred))
auc_keras = auc(fpr_keras, tpr_keras)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

test_loss, test_acc = model.evaluate(x_test, y_test, batch_size=batch_size)

model.save("BDLSTM.h5")

print('Test accuracy :', test_acc, 'Test Loss :', test_loss)
print('matthews correlation coefficient ',
      matthews_corrcoef(y_test, np.ravel(y_pred.round())))
print(
    classification_report(y_test,
                          np.ravel(y_pred.round()),
                          target_names=['class 1', 'class 2']))
print('r2 score ', r2_score(y_test, np.ravel(y_pred.round())))
예제 #44
0
# Spearmans Rank correlation matrix
training_correlation = training_categorical.drop(
    ['Embarked', 'Deck', 'Title', 'Stage', 'Died'], axis=1)
spearman_matrix = training_correlation.corr(method="spearman")
fig = plt.figure(figsize=(12, 10))
sns.heatmap(spearman_matrix, annot=True)

# Pearson's correlation
pearson_matrix = training_correlation.corr(method="pearson")
fig = plt.figure(figsize=(12, 10))
sns.heatmap(pearson_matrix, annot=True)

# Phi coefficient to measure association between binary variables
print(
    matthews_corrcoef(training_categorical['Sex'],
                      training_categorical['Survived']))
print(
    matthews_corrcoef(training_categorical['IsAlone'],
                      training_categorical['Survived']))

# Check proportion of each sex that survived.
training_data.groupby('Sex').agg('mean')[['Survived',
                                          'Died']].plot(kind='bar',
                                                        figsize=(25, 7),
                                                        stacked=True)

# Check proportion of each Pclass that survived.
training_data.groupby('Pclass').agg('mean')[['Survived',
                                             'Died']].plot(kind='bar',
                                                           figsize=(25, 7),
                                                           stacked=True)
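For two binary variables, matthews_corrcoef is exactly the phi coefficient, which is why it serves as an association measure above. A quick check on toy 0/1 arrays (assumed data, not the Titanic columns themselves):

import numpy as np
from sklearn.metrics import matthews_corrcoef

a = np.array([1, 1, 0, 0, 1, 0, 1, 0, 1, 1])   # toy binary variable, e.g. an encoded 'Sex'
b = np.array([1, 0, 0, 0, 1, 0, 1, 1, 1, 1])   # toy binary variable, e.g. 'Survived'

# phi from the 2x2 contingency table
n11 = np.sum((a == 1) & (b == 1))
n00 = np.sum((a == 0) & (b == 0))
n10 = np.sum((a == 1) & (b == 0))
n01 = np.sum((a == 0) & (b == 1))
phi = (n11 * n00 - n10 * n01) / np.sqrt(
    (n11 + n10) * (n00 + n01) * (n11 + n01) * (n00 + n10))
print(phi, matthews_corrcoef(a, b))            # same value up to floating point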
예제 #45
0

#create a confusion matrix
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy train:", metrics.accuracy_score(y_train, test))
print("Accuracy test:", metrics.accuracy_score(y_test, y_pred))

#MCC
print("MCC test:", metrics.matthews_corrcoef(y_test, y_pred))

# The recall is intuitively the ability of the classifier to find all the positive samples.
print(
    "Recall test average:",
    metrics.recall_score(y_test, y_pred, average='weighted', zero_division=1))

#Precision
print(
    "precision test average:",
    metrics.precision_score(y_test,
                            y_pred,
                            average='weighted',
                            zero_division=1))

#F1-score
예제 #46
0
def evaluate(model,
             test_x,
             test_y,
             output_folder,
             title,
             class_specific=False,
             all_labels=None,
             weight_vector=None):
    # Convert all_labels (if provided) to a plain list for the sklearn metric calls
    if all_labels is None:
        labels = None
    else:
        labels = list(all_labels)

    if weight_vector is None:
        y_predicted = model.predict(test_x)
        y_predicted_max = np.argmax(y_predicted, axis=1)
    else:
        # Variant Output Shift
        y_predicted = model.predict(test_x)
        predicted_shift = list()
        for e in y_predicted:
            predicted_shift.append(shift_output(e, weight_vector))
        y_predicted_max = np.argmax(predicted_shift, axis=1)

    y_test_max = np.argmax(test_y, axis=1)

    # Print classification report
    report = classification_report(y_test_max,
                                   y_predicted_max,
                                   labels=labels,
                                   output_dict=True,
                                   digits=5)
    report_df = pd.DataFrame(report)
    report_df.to_csv(os.path.join(output_folder, 'report_' + title + '.csv'),
                     sep=' ',
                     header=True,
                     mode='a')

    # Print confusion matrix
    cm = confusion_matrix(y_test_max, y_predicted_max, labels=labels)
    cm_df = pd.DataFrame(cm)
    cm_df.to_csv(os.path.join(output_folder, 'cm_' + title + '.csv'),
                 sep=' ',
                 header=True,
                 mode='a')

    metrics = dict()

    # Evaluate further metrics
    # =============================================================================
    #    Balanced Accuracy Score
    # =============================================================================
    metrics['Balanced Accuracy Score'] = balanced_accuracy_score(
        y_test_max, y_predicted_max)

    # =============================================================================
    #    Cohen Kappa Score
    # =============================================================================
    metrics['Cohen Kappa Score (No weighted)'] = cohen_kappa_score(
        y_predicted_max, y_test_max, weights=None)
    metrics['Cohen Kappa Score (Linear weighted)'] = cohen_kappa_score(
        y_predicted_max, y_test_max, weights='linear')
    metrics['Cohen Kappa Score (Quadratic weighted)'] = cohen_kappa_score(
        y_predicted_max, y_test_max, weights='quadratic')

    # =============================================================================
    #    Hinge Loss
    # =============================================================================
    metrics['Hinge Loss'] = hinge_loss(y_test_max, y_predicted, labels=labels)

    # =============================================================================
    #    Matthews Correlation Coefficient
    # =============================================================================
    metrics['Matthews Correlation Coefficient'] = matthews_corrcoef(
        y_test_max, y_predicted_max)

    # =============================================================================
    #    Top k Accuracy Score (not working yet, TODO)
    # =============================================================================
    # print("\n Top k Accuracy: ")
    # print(top_k_accuracy_score(y_test_max, y_predicted_max, k=5))

    # =============================================================================
    #    The following also work in the multi label case
    # =============================================================================

    # =============================================================================
    #    Accuracy Score
    # =============================================================================
    metrics['Accuracy Score'] = accuracy_score(y_test_max, y_predicted_max)

    # =============================================================================
    #    F1 Score
    # =============================================================================
    metrics['F Score (Micro)'] = f1_score(y_test_max,
                                          y_predicted_max,
                                          average='micro')
    metrics['F Score (Macro)'] = f1_score(y_test_max,
                                          y_predicted_max,
                                          average='macro')
    metrics['F Score (Weighted)'] = f1_score(y_test_max,
                                             y_predicted_max,
                                             average='weighted')
    if class_specific:
        metrics['F Score (None, i.e. for each class)'] = f1_score(
            y_test_max, y_predicted_max, average=None)

    # =============================================================================
    #    ROC AUC Score (in case of multi class sklearn only support macro and weighted averages)
    # =============================================================================
    # ROC AUC only works if each label occurs at least once, so we need to catch the exception
    print(y_test_max)
    try:
        metrics['ROC AUC Score (OVR) Macro'] = roc_auc_score(y_test_max,
                                                             y_predicted,
                                                             multi_class='ovr',
                                                             average='macro',
                                                             labels=labels)
        metrics['ROC AUC Score (OVR) Weighted'] = roc_auc_score(
            y_test_max,
            y_predicted,
            multi_class='ovr',
            average='weighted',
            labels=labels)
        metrics['ROC AUC Score (OVO) Macro'] = roc_auc_score(y_test_max,
                                                             y_predicted,
                                                             multi_class='ovo',
                                                             average='macro',
                                                             labels=labels)
        metrics['ROC AUC Score (OVO) Weighted'] = roc_auc_score(
            y_test_max,
            y_predicted,
            multi_class='ovo',
            average='weighted',
            labels=labels)
    except:
        print("Cannot calculate ROC AUC Score!")
        pass

    # =============================================================================
    #    F Beta Score
    # =============================================================================
    metrics['F Beta Score (Micro) b=0.5'] = fbeta_score(y_test_max,
                                                        y_predicted_max,
                                                        average='micro',
                                                        beta=0.5)
    metrics['F Beta Score (Macro) b=0.5'] = fbeta_score(y_test_max,
                                                        y_predicted_max,
                                                        average='macro',
                                                        beta=0.5)
    metrics['F Beta Score (Weighted) b=0.5'] = fbeta_score(y_test_max,
                                                           y_predicted_max,
                                                           average='weighted',
                                                           beta=0.5)
    if class_specific:
        metrics[
            'F Beta Score (None, i.e. for each class) b=0.5'] = fbeta_score(
                y_test_max, y_predicted_max, average=None, beta=0.5)

    metrics['F Beta Score (Micro) b=1.5'] = fbeta_score(y_test_max,
                                                        y_predicted_max,
                                                        average='micro',
                                                        beta=1.5)
    metrics['F Beta Score (Macro) b=1.5'] = fbeta_score(y_test_max,
                                                        y_predicted_max,
                                                        average='macro',
                                                        beta=1.5)
    metrics['F Beta Score (Weighted) b=1.5'] = fbeta_score(y_test_max,
                                                           y_predicted_max,
                                                           average='weighted',
                                                           beta=1.5)
    if class_specific:
        metrics[
            'F Beta Score (None, i.e. for each class) b=1.5'] = fbeta_score(
                y_test_max, y_predicted_max, average=None, beta=1.5)

    # =============================================================================
    #    Hamming Loss
    # =============================================================================
    metrics['Hamming Loss'] = hamming_loss(y_test_max, y_predicted_max)

    # =============================================================================
    #    Jaccard Score
    # =============================================================================
    metrics['Jaccard Score (Micro)'] = jaccard_score(y_test_max,
                                                     y_predicted_max,
                                                     average='micro')
    metrics['Jaccard Score (Macro)'] = jaccard_score(y_test_max,
                                                     y_predicted_max,
                                                     average='macro')
    metrics['Jaccard Score (Weighted)'] = jaccard_score(y_test_max,
                                                        y_predicted_max,
                                                        average='weighted')
    if class_specific:
        metrics['Jaccard Score (None, i.e. for each class)'] = jaccard_score(
            y_test_max, y_predicted_max, average=None)

    # =============================================================================
    #    Log Loss
    # =============================================================================
    metrics['Log Loss'] = log_loss(y_test_max, y_predicted, labels=labels)

    # =============================================================================
    #    Precision Score
    # =============================================================================
    metrics['Precision Score (Micro)'] = precision_score(y_test_max,
                                                         y_predicted_max,
                                                         average='micro')
    metrics['Precision Score (Macro)'] = precision_score(y_test_max,
                                                         y_predicted_max,
                                                         average='macro')
    metrics['Precision Score (Weighted)'] = precision_score(y_test_max,
                                                            y_predicted_max,
                                                            average='weighted')
    if class_specific:
        metrics[
            'Precision Score (None, i.e. for each class)'] = precision_score(
                y_test_max, y_predicted_max, average=None)

    # =============================================================================
    #    Specificity Score
    # =============================================================================
    metrics['Specificity Score (Micro)'] = specificity_score(y_test_max,
                                                             y_predicted_max,
                                                             average='micro')
    metrics['Specificity Score (Macro)'] = specificity_score(y_test_max,
                                                             y_predicted_max,
                                                             average='macro')
    metrics['Specificity Score (Weighted)'] = specificity_score(
        y_test_max, y_predicted_max, average='weighted')
    if class_specific:
        metrics[
            'Specificity Score (None, i.e. for each class)'] = specificity_score(
                y_test_max, y_predicted_max, average=None)

    # =============================================================================
    #    Recall Score (also named Sensitivity Score). Hence, the Sensitivity Score values
    #   should be the same as the Recall Score values
    # =============================================================================
    metrics['Recall Score (Micro)'] = recall_score(y_test_max,
                                                   y_predicted_max,
                                                   average='micro')
    metrics['Recall Score (Macro)'] = recall_score(y_test_max,
                                                   y_predicted_max,
                                                   average='macro')
    metrics['Recall Score (Weighted)'] = recall_score(y_test_max,
                                                      y_predicted_max,
                                                      average='weighted')
    if class_specific:
        metrics['Recall Score (None, i.e. for each class)'] = recall_score(
            y_test_max, y_predicted_max, average=None)

    metrics['Sensitivity Score (Micro)'] = sensitivity_score(y_test_max,
                                                             y_predicted_max,
                                                             average='micro')
    metrics['Sensitivity Score (Macro)'] = sensitivity_score(y_test_max,
                                                             y_predicted_max,
                                                             average='macro')
    metrics['Sensitivity Score (Weighted)'] = sensitivity_score(
        y_test_max, y_predicted_max, average='weighted')
    if class_specific:
        metrics[
            'Sensitivity Score (None, i.e. for each class)'] = sensitivity_score(
                y_test_max, y_predicted_max, average=None)

    # =============================================================================
    #    Geometric Mean Score
    # =============================================================================
    metrics['Geometric Mean Score (Normal)'] = geometric_mean_score(
        y_test_max, y_predicted_max)
    metrics['Geometric Mean Score (Micro)'] = geometric_mean_score(
        y_test_max, y_predicted_max, average='micro')
    metrics['Geometric Mean Score (Macro)'] = geometric_mean_score(
        y_test_max, y_predicted_max, average='macro')
    metrics['Geometric Mean Score (Weighted)'] = geometric_mean_score(
        y_test_max, y_predicted_max, average='weighted')
    if class_specific:
        metrics[
            'Geometric Mean Score (None, i.e. for each class)'] = geometric_mean_score(
                y_test_max, y_predicted_max, average=None)

    # =============================================================================
    #    Zero one Loss
    # =============================================================================
    metrics['Zero One Loss'] = zero_one_loss(y_test_max, y_predicted_max)

    # =============================================================================
    #    Make Index Balanced Accuracy (MIBA), kept as a commented-out example
    # =============================================================================
    # print("\n MIBA with Matthews")
    # geo_mean = make_index_balanced_accuracy(alpha=0.5, squared=True)(hamming_loss)
    # print(geo_mean(y_test_max, y_predicted_max))
    return metrics
예제 #47
0
    def performance_measure(self, X, y, model, nfolds, modelIndex,
                            independent_pred_var, independent_outcome_var):
        self.nfolds = nfolds
        self.modelIndex = modelIndex

        sen_list = []
        spec_list = []
        acc_list = []
        pre_list = []
        mcc_list = []
        f1_list = []

        tpr_list = []
        mean_fpr = np.linspace(0, 1, 100)
        # auc_list = []

        # this is for independent dataset
        sen_list_independent = []
        spec_list_independent = []
        acc_list_independent = []
        pre_list_independent = []
        mcc_list_independent = []
        f1_list_independent = []

        tpr_list_independent = []
        mean_fpr_independent = np.linspace(0, 1, 100)

        skf = StratifiedKFold(n_splits=self.nfolds, random_state=423)

        for train_index, test_index in skf.split(X, y):
            # print("Train:", train_index, "Test:", test_index)
            if modelIndex == 6:
                probability_model = model.fit(X[train_index],
                                              y[train_index],
                                              steps=2000).predict_proba(
                                                  X[test_index],
                                                  as_iterable=False)
                prediction_model = model.fit(X[train_index],
                                             y[train_index],
                                             steps=2000).predict(
                                                 X[test_index],
                                                 as_iterable=False)

                fpr, tpr, thresholds = metrics.roc_curve(
                    y[test_index], probability_model[:, 1])

                tpr_list.append(interp(mean_fpr, fpr, tpr))
                tpr_list[-1][0] = 0.0

                conf_matrix = metrics.confusion_matrix(y[test_index],
                                                       prediction_model)

                # this is use to predict independent dataset
                probability_model_independent = model.fit(
                    X[train_index], y[train_index],
                    steps=2000).predict_proba(independent_pred_var,
                                              as_iterable=False)
                prediction_model_independent = model.fit(
                    X[train_index], y[train_index],
                    steps=2000).predict(independent_pred_var,
                                        as_iterable=False)
                fpr_independent, tpr_independent, thresholds_independent = metrics.roc_curve(
                    independent_outcome_var, probability_model_independent[:,
                                                                           1])

                tpr_list_independent.append(
                    interp(mean_fpr_independent, fpr_independent,
                           tpr_independent))
                tpr_list_independent[-1][0] = 0.0

                conf_matrix_independent = metrics.confusion_matrix(
                    independent_outcome_var, prediction_model_independent)

            else:
                probability_model = model.fit(X[train_index],
                                              y[train_index]).predict_proba(
                                                  X[test_index])
                prediction_model = model.fit(
                    X[train_index], y[train_index]).predict(X[test_index])

                # this use to predict independent dataset
                probability_model_independent = model.fit(
                    X[train_index],
                    y[train_index]).predict_proba(independent_pred_var)
                prediction_model_independent = model.fit(
                    X[train_index],
                    y[train_index]).predict(independent_pred_var)

                fpr, tpr, thresholds = metrics.roc_curve(
                    y[test_index], probability_model[:, 1])

                tpr_list.append(interp(mean_fpr, fpr, tpr))
                tpr_list[-1][0] = 0.0

                conf_matrix = metrics.confusion_matrix(y[test_index],
                                                       prediction_model)

                # this is for independent dataset
                fpr_independent, tpr_independent, thresholds_independent = metrics.roc_curve(
                    independent_outcome_var, probability_model_independent[:,
                                                                           1])

                tpr_list_independent.append(
                    interp(mean_fpr_independent, fpr_independent,
                           tpr_independent))
                tpr_list_independent[-1][0] = 0.0

                conf_matrix_independent = metrics.confusion_matrix(
                    independent_outcome_var, prediction_model_independent)

            new_list_CM = []

            for i in conf_matrix:
                for j in i:
                    new_list_CM.append(j)

            # sklearn orders the 2x2 confusion matrix as [[TN, FP], [FN, TP]]
            # (labels sorted as [0, 1], with 1 taken as the positive class)
            TN = float(new_list_CM[0])
            FP = float(new_list_CM[1])
            FN = float(new_list_CM[2])
            TP = float(new_list_CM[3])

            # print("TP:", TP, "FP:", FP, "FN:", FN,"TN:", TN)
            try:
                sensitivity = round(float(TP / (TP + FN)), 2)
            except:
                print("Error in sensitivity")
                pass
            try:
                specificity = round(float(TN / (TN + FP)), 2)
            except:
                print("Error in specificity")
                pass
            try:
                accuracy = round(float((TP + TN) / (TP + FP + FN + TN)), 2)
            except:
                print("Error in accuracy")
                pass
            try:
                precision = round(float(TP / (TP + FP)), 2)
            except:
                print("Error in precision")
                pass
            try:
                mcc = round(
                    metrics.matthews_corrcoef(y[test_index], prediction_model),
                    2)
            except:
                print("Error in mcc")
                pass
            try:
                # f1 = round(metrics.f1_score(y[test_index], prediction_model), 2)
                f1 = 2 * ((sensitivity * precision) /
                          (sensitivity + precision))
            except:
                print("Error in f1")
                pass

            # store the value in list of performance measure
            sen_list.append(sensitivity)
            spec_list.append(specificity)
            acc_list.append(accuracy)
            pre_list.append(precision)
            mcc_list.append(mcc)
            f1_list.append(f1)

            # this is for independent dataset
            new_list_CM_independent = []
            for i in conf_matrix_independent:
                for j in i:
                    new_list_CM_independent.append(j)

            # same [[TN, FP], [FN, TP]] ordering as above
            TN_independent = float(new_list_CM_independent[0])
            FP_independent = float(new_list_CM_independent[1])
            FN_independent = float(new_list_CM_independent[2])
            TP_independent = float(new_list_CM_independent[3])
            # print("TP_Independent:", TP_independent, "FP_Independent:", FP_independent, "FN_Independent:", FN_independent,"TN_Independent:", TN_independent)
            try:
                sensitivity_independent = round(
                    float(TP_independent / (TP_independent + FN_independent)),
                    2)
            except:
                print("Error in sensitivity_independent")
                pass
            try:
                specificity_independent = round(
                    float(TN_independent / (TN_independent + FP_independent)),
                    2)
            except:
                print("Error in specificity_independent")
                pass
            try:
                accuracy_independent = round(
                    float((TP_independent + TN_independent) /
                          (TP_independent + FP_independent + FN_independent +
                           TN_independent)), 2)
            except:
                print("Error in accuracy_independent")
                pass
            try:
                precision_independent = round(
                    float(TP_independent / (TP_independent + FP_independent)),
                    2)
            except:
                print("Error in precision_independent")
                pass
            try:
                mcc_independent = round(
                    metrics.matthews_corrcoef(independent_outcome_var,
                                              prediction_model_independent), 2)
            except:
                print("Error in mcc_independent")
                pass
            try:
                # f1 = round(metrics.f1_score(y[test_index], prediction_model), 2)
                f1_independent = 2 * (
                    (sensitivity_independent * precision_independent) /
                    (sensitivity_independent + precision_independent))
            except:
                print("Error in f1_independent")
                pass

            # store the value in list of performance measure
            sen_list_independent.append(sensitivity_independent)
            spec_list_independent.append(specificity_independent)
            acc_list_independent.append(accuracy_independent)
            pre_list_independent.append(precision_independent)
            mcc_list_independent.append(mcc_independent)
            f1_list_independent.append(f1_independent)

        sen_mean = round(float(sum(sen_list)) / float(len(sen_list)), 3)
        spec_mean = round(float(sum(spec_list)) / float(len(spec_list)), 3)
        acc_mean = round(float(sum(acc_list)) / float(len(acc_list)), 3)
        pre_mean = round(float(sum(pre_list)) / float(len(pre_list)), 3)
        mcc_mean = round(float(sum(mcc_list)) / float(len(mcc_list)), 3)
        f1_mean = round(float(sum(f1_list)) / float(len(f1_list)), 3)

        mean_tpr = np.mean(tpr_list, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = metrics.auc(mean_fpr, mean_tpr)

        # this is for independent dataset
        sen_mean_independent = round(
            float(sum(sen_list_independent)) /
            float(len(sen_list_independent)), 3)
        spec_mean_independent = round(
            float(sum(spec_list_independent)) /
            float(len(spec_list_independent)), 3)
        acc_mean_independent = round(
            float(sum(acc_list_independent)) /
            float(len(acc_list_independent)), 3)
        pre_mean_independent = round(
            float(sum(pre_list_independent)) /
            float(len(pre_list_independent)), 3)
        mcc_mean_independent = round(
            float(sum(mcc_list_independent)) /
            float(len(mcc_list_independent)), 3)
        f1_mean_independent = round(
            float(sum(f1_list_independent)) / float(len(f1_list_independent)),
            3)

        mean_tpr_independent = np.mean(tpr_list_independent, axis=0)
        mean_tpr_independent[-1] = 1.0
        mean_auc_independent = metrics.auc(mean_fpr_independent,
                                           mean_tpr_independent)

        perf_header = ("sensitivity", "specificity", "accuracy", "precision",
                       "mcc", "f1", "auc")
        perf_value = (sen_mean, spec_mean, acc_mean, pre_mean, mcc_mean,
                      f1_mean, round(mean_auc, 3))

        # this is for independent dataset
        perf_header_independent = ("sensitivity_independent",
                                   "specificity_independent",
                                   "accuracy_independent",
                                   "precision_independent", "mcc_independent",
                                   "f1_independent", "auc_independent")
        perf_value_independent = (sen_mean_independent, spec_mean_independent,
                                  acc_mean_independent, pre_mean_independent,
                                  mcc_mean_independent, f1_mean_independent,
                                  round(mean_auc_independent, 3))
        # print("Header:",perf_header, "Value:", perf_value)

        print("Inside performance measures........")
        # print(model_list)

        return perf_header, perf_value, mean_tpr, mean_fpr, perf_header_independent, perf_value_independent, mean_tpr_independent, mean_fpr_independent
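One detail worth pinning down for the confusion-matrix bookkeeping above: for binary labels 0/1, sklearn returns [[TN, FP], [FN, TP]], so ravel() unpacks as tn, fp, fn, tp. A quick self-contained check:

from sklearn.metrics import confusion_matrix

y_true = [0, 0, 0, 1, 1, 1, 1]
y_pred = [0, 1, 0, 1, 1, 0, 1]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tn, fp, fn, tp)            # 2 1 1 3
sensitivity = tp / (tp + fn)     # recall of the positive class
specificity = tn / (tn + fp)
print(sensitivity, specificity)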
예제 #48
0
def evaluate_cola(targets, preds):
    #matthews_corrcoef
    mcc = matthews_corrcoef(targets, preds)
    print("mcc: %.3f" % mcc)
예제 #49
0
cols_to_del = []
for i in range(57914):
    if (i not in cols_anova):
        cols_to_del.append(i)

x = df.drop(columns=cols_to_del)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# define the keras model
model = Sequential()
model.add(Dense(64, input_dim=300, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train, y_train, epochs=15, batch_size=100)

y_pred = model.predict_classes(X_test)

y_pred_seris = pd.Series(y_pred.flatten())

print("---------(Neural Network)-----------")
print(confusion_matrix(y_test, y_pred_seris))
print(classification_report(y_test, y_pred_seris))
print("MCC Score (Neural Network): ", matthews_corrcoef(y_test, y_pred_seris))
print("---------(Neural Network)-----------")
예제 #50
0
y_all_pred = []

for i in list:
    if i.split('.',1)[0] == i:
        continue
    else:
        y_real = []
        y_pred = []
        f = open(root+i)
        lines = f.readlines()
        f.close()
        for j in lines:
            y_real.append(float(j.split(' ',1)[0]))
        
        f = open(root+i.split('.',1)[0])
        lines = f.readlines()
        f.close()
        for j in lines:
            y_pred.append(float(j.split('\\',1)[0]))
        
        a, b, c, d = metrics.confusion_matrix(y_real, y_pred).ravel()
        print([
            i,
            metrics.accuracy_score(y_real, y_pred), a, b, c, d,
            metrics.precision_score(y_real, y_pred),
            metrics.recall_score(y_real, y_pred),
            metrics.f1_score(y_real, y_pred),
            matthews_corrcoef(y_real, y_pred)
        ])
        y_all.append(y_real)
        y_all_pred.append(y_pred)

y_all=[y for x in y_all for y in x]
y_all_pred=[y for x in y_all_pred for y in x]
a, b, c, d = metrics.confusion_matrix(y_all, y_all_pred).ravel()
print([
    metrics.accuracy_score(y_all, y_all_pred), a, b, c, d,
    metrics.precision_score(y_all, y_all_pred),
    metrics.recall_score(y_all, y_all_pred),
    metrics.f1_score(y_all, y_all_pred),
    matthews_corrcoef(y_all, y_all_pred)
])

예제 #51
0
def train_model(model,
                dataloaders,
                criterion,
                optimizer,
                scheduler,
                num_epochs=25,
                patience=50,
                mode='classification'):
    """trains a deep learning model on predicting glycan properties\n
  | Arguments:
  | :-
  | model (PyTorch object): graph neural network (such as SweetNet) for analyzing glycans
  | dataloaders (PyTorch object): dictionary of dataloader objects with keys 'train' and 'val'
  | criterion (PyTorch object): PyTorch loss function
  | optimizer (PyTorch object): PyTorch optimizer
  | scheduler (PyTorch object): PyTorch learning rate decay
  | num_epochs (int): number of epochs for training; default: 25
  | patience (int): number of epochs without improvement until early stop; default: 50
  | mode (string): 'classification' or 'regression'; default is binary classification\n
  | Returns:
  | :-
  | Returns the best model seen during training
  """
    since = time.time()
    early_stopping = EarlyStopping(patience=patience, verbose=True)
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100.0
    epoch_mcc = 0
    if mode == 'classification':
        best_acc = 0.0
    else:
        best_acc = 100.0
    val_losses = []
    val_acc = []

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = []
            running_acc = []
            running_mcc = []
            for data in dataloaders[phase]:
                x, y, edge_index, batch = data.x, data.y, data.edge_index, data.batch
                x = x.cuda()
                y = y.cuda()
                edge_index = edge_index.cuda()
                batch = batch.cuda()
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    pred = model(x, edge_index, batch)
                    loss = criterion(pred, y)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss.append(loss.item())
                if mode == 'classification':
                    pred2 = np.argmax(pred.cpu().detach().numpy(), axis=1)
                    running_acc.append(
                        accuracy_score(y.cpu().detach().numpy().astype(int),
                                       pred2))
                    running_mcc.append(
                        matthews_corrcoef(y.detach().cpu().numpy(), pred2))
                else:
                    # regression mode: append() takes a single value, so track a
                    # scalar error here (mean squared error, assumed)
                    running_acc.append(np.mean((y.cpu().detach().numpy().flatten() -
                                                pred.cpu().detach().numpy().flatten()) ** 2))

            epoch_loss = np.mean(running_loss)
            epoch_acc = np.mean(running_acc)
            if mode == 'classification':
                epoch_mcc = np.mean(running_mcc)
            print('{} Loss: {:.4f} Accuracy: {:.4f} MCC: {:.4f}'.format(
                phase, epoch_loss, epoch_acc, epoch_mcc))

            if phase == 'val' and epoch_loss <= best_loss:
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
            if mode == 'classification':
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
            else:
                if phase == 'val' and epoch_acc < best_acc:
                    best_acc = epoch_acc
            if phase == 'val':
                val_losses.append(epoch_loss)
                val_acc.append(epoch_acc)
                early_stopping(epoch_loss, model)

            scheduler.step()

        if early_stopping.early_stop:
            print("Early stopping")
            break
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val loss: {:4f}, best Accuracy score: {:.4f}'.format(
        best_loss, best_acc))
    model.load_state_dict(best_model_wts)

    ## plot loss & accuracy score over the course of training
    fig, ax = plt.subplots(nrows=2, ncols=1)
    plt.subplot(2, 1, 1)
    plt.plot(range(epoch + 1), val_losses)
    plt.title('Training of SweetNet')
    plt.ylabel('Validation Loss')
    plt.legend(['Validation Loss'], loc='best')

    plt.subplot(2, 1, 2)
    plt.plot(range(epoch + 1), val_acc)
    plt.ylabel('Validation Accuracy')
    plt.xlabel('Number of Epochs')
    plt.legend(['Validation Accuracy'], loc='best')
    return model
예제 #52
0
def my_matthews_corrcoef(y_true, y_pred):
    return matthews_corrcoef(y_true, y_pred)
예제 #53
0
def main():

    dataset_name = 'DM2'
    rep = sys.argv[1]
    split = sys.argv[2]
    print('rep:' + rep + '    split:' + split)

    train_pairs_file = 'CV/train' + str(rep) + '-' + str(split)
    test_pairs_file = 'CV/test' + str(rep) + '-' + str(split)
    valid_pairs_file = 'CV/valid' + str(rep) + '-' + str(split)

    batch_size = 32
    train_generator = DataGenerator(train_pairs_file, batch_size=batch_size)
    test_generator = DataGenerator(test_pairs_file, batch_size=batch_size)
    valid_generator = DataGenerator(valid_pairs_file, batch_size=batch_size)

    # model = build_model_without_att()
    model = build_model()
    save_model_name = 'CV/GoplusSeq' + str(rep) + '-' + str(split) + '.hdf5'

    earlyStopping = EarlyStopping(monitor='val_acc',
                                  patience=20,
                                  verbose=0,
                                  mode='max')
    save_checkpoint = ModelCheckpoint(save_model_name,
                                      save_best_only=True,
                                      monitor='val_acc',
                                      mode='max',
                                      save_weights_only=True)

    # validation_data = (valid_X, valid_Y),  verbose=1,callbacks=[earlyStopping, save_checkpoint]
    #  max_queue_size=16, workers=8, use_multiprocessing=True,
    hist = model.fit_generator(generator=train_generator,
                               validation_data=valid_generator,
                               epochs=100,
                               verbose=1,
                               callbacks=[earlyStopping, save_checkpoint])

    # model = load_model(save_model_name)
    model.load_weights(save_model_name)
    with open(test_pairs_file, 'r') as f:
        test_ppi_pairs = f.readlines()

    test_len = len(test_ppi_pairs)
    list_IDs_temp = np.arange(test_len)

    test_x, y_test = test_generator.all_data(list_IDs_temp)

    y_pred_prob = model.predict(test_x)

    y_pred = (y_pred_prob > 0.5)
    auc = metrics.roc_auc_score(y_test, y_pred_prob)
    f1 = f1_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    precision, recall, _thresholds = metrics.precision_recall_curve(
        y_test, y_pred_prob)
    pr_auc = metrics.auc(recall, precision)
    mcc = matthews_corrcoef(y_test, y_pred)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    total = tn + fp + fn + tp
    sen = float(tp) / float(tp + fn)
    sps = float(tn) / float((tn + fp))

    tpr = float(tp) / float(tp + fn)
    fpr = float(fp) / float((tn + fp))
    print('--------------------------\n')
    print('AUC: %f' % auc)
    print('ACC: %f' % acc)
    # print("PRAUC: %f" % pr_auc)
    print('MCC : %f' % mcc)
    # print ('SEN: %f' % sen)
    # print ('SEP: %f' % sps)
    print('TPR:%f' % tpr)
    print('FPR:%f' % fpr)
    print('Pre:%f' % pre)
    print('F1:%f' % f1)
예제 #54
0
File: core.py  Project: pruksmhc/jiant-1
 def compute_metrics_from_preds_and_labels(cls, preds, labels):
     mcc = matthews_corrcoef(labels, preds)
     return Metrics(major=mcc, minor={"mcc": mcc})
예제 #55
0
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  prediction.append(logits)
  true_labels.append(label_ids)

print(" DONE.")

from sklearn.metrics import matthews_corrcoef

flat_prediction = [item for sublist in prediction for item in sublist]
flat_prediction = np.argmax(flat_prediction, axis=1).flatten()

flat_true_labels = [item for sublist in true_labels for item in sublist]

mcc = matthews_corrcoef(flat_true_labels, flat_prediction)

print("MCC: %.3f" %mcc)

from sklearn.metrics import accuracy_score

acc = accuracy_score(flat_true_labels, flat_prediction)

print("ACC: %.3f" %acc)

def predict(dataloader):
  prediction = []

  for batch in dataloader:
    batch = tuple(t.to(device) for t in batch)
예제 #56
0
def train_pssm(input_id, input_top, win, out_file_path):
    '''Takes pssms from X_train and trains and saves model'''

    pssm_list_train = []
    for ID in input_id:
        pssm = '../datasets/PSSM_files/PSSMs/' + ID + '.fasta.pssm'  #location of your pssms

        pssm_list_train.append(
            np.genfromtxt(pssm,
                          skip_header=3,
                          skip_footer=5,
                          usecols=range(22, 42)))
    X_train = pssm_list_train

    ###################################################

    ###################################################
    X_train_changed, array_numbering = extract_pssms(X_train,
                                                     win)  #X_train = pssm_list

    states = {'g': 1, 'B': -1}
    Y_train_changed = []
    for proteins in input_top:
        for topologies in proteins:
            y = states[topologies]
            Y_train_changed.append(y)

    x_train, x_test, y_train, y_test = train_test_split(X_train_changed,
                                                        Y_train_changed,
                                                        test_size=0.33,
                                                        random_state=42)
    seq = x_train
    top = y_train
    cross_val = 5
    labels = [1, -1]

    #   training
    clf = svm.SVC(gamma=0.001, kernel='linear', C=1.0)

    cvs_svm = cross_val_score(clf, seq, top, cv=cross_val, n_jobs=-1)
    cvs_svm_mean = cvs_svm.mean()
    clf.fit(x_train, y_train)

    # save the classifier only after it has been fitted
    filename = '../output/pssm_model.sav'
    pickle.dump(clf, open(filename, 'wb'))
    #   prediction
    y_test_top_predicted_svm = clf.predict(x_test)
    svm_classreport = classification_report(y_test, y_test_top_predicted_svm,
                                            labels)
    svm_confusionm = confusion_matrix(y_test, y_test_top_predicted_svm, labels)
    svm_mcc = matthews_corrcoef(y_test, y_test_top_predicted_svm)

    with open(
            out_file_path + str(win) +
            'winsize_pssm_based_model_scoringresults.txt', 'w') as out_file:
        out_file.write('Cross-validation scores for PSSM-SVC: ' +
                       str(cvs_svm_mean) + '\n')
        out_file.write('Matthews correlation coefficient (MCC) SVM: ' +
                       str(svm_mcc) + '\n')
        out_file.write('Classification report SVM: ' + '\n' +
                       str(svm_classreport) + '\n')
        out_file.write('Confusion matrix SVM: ' + '\n' + str(svm_confusionm) +
                       '\n')
    out_file.close()

    print(svm_classreport)
    print(svm_confusionm)
    print(svm_mcc)

    return
예제 #57
0
File: threshold.py  Project: hzhou256/py
    best_t = 0
    y_prob = get_y(y_proba, c)
    for threshold in np.arange(0.1, 1, 0.001):
        y_pred = get_label_use_thres(y_prob, threshold)
        y_true = get_y(y_train, c)

        confusion_matrix = metrics.confusion_matrix(y_true, y_pred)
        # metrics.confusion_matrix is [[TN, FP], [FN, TP]] for labels ordered [0, 1]
        TN = confusion_matrix[0][0]
        FP = confusion_matrix[0][1]
        FN = confusion_matrix[1][0]
        TP = confusion_matrix[1][1]

        ACC = (TP + TN) / (TP + TN + FP + FN)
        SN = TP / (TP + FN)
        SP = TN / (TN + FP)
        PR = TP / (TP + FP)
        MCC = metrics.matthews_corrcoef(y_true, y_pred)
        if best_MCC < MCC:
            best_MCC = MCC
            best_ACC = ACC
            best_SP = SP
            best_SN = SN
            best_PR = PR
            best_t = threshold
    print('class_' + str(c+1) + ':')
    print('ACC =', best_ACC)
    print('SN =', best_SN)
    print('SP =', best_SP)
    print('Precision =', best_PR)
    print('MCC =', best_MCC)
    print('threshold =', best_t)
예제 #58
0
def ClassifierReport(y_true, y_preds, y_probas, img_save=False):
    '''Binary classification model evaluation (may be extended to multi-class later).
    Various visualizations and metrics comparing the true and predicted data.

    parameters:
    -----------
    y_true: array_like, true labels, binary
    y_preds: dict or array_like. Predicted labels, binary; a dict can hold the predictions of several models
    y_probas: dict or array_like. Predicted probabilities in [0, 1]; a dict can hold the probabilities of several models
    img_save: bool, whether to save the figures to disk directly

    return:
    ---------
    models_report: the evaluation metrics for each model
    conf_matrix: the confusion matrix for each model
    '''

    #from sklearn import metrics
    assert type(y_preds) == type(y_probas)
    if not (isinstance(y_preds, dict)):
        y_preds = {'clf': y_preds}
        y_probas = {'clf': y_probas}
    models_report = pd.DataFrame()
    conf_matrix = {}
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()
    fig3, ax3 = plt.subplots()
    for clf in y_preds:
        y_pred = y_preds[clf]
        y_proba = y_probas[clf]
        try:
            kl_div_score = entropyc.kl_div(y_proba[y_true == 1],
                                           y_proba[y_true == 0])
            kl_div_score += entropyc.kl_div(y_proba[y_true == 0],
                                            y_proba[y_true == 1])
        except:
            kl_div_score = np.nan
        scores = pd.Series({
            'model':
            clf,
            'roc_auc_score':
            metrics.roc_auc_score(y_true, y_proba),
            'good_rate':
            y_true.value_counts()[0] / len(y_true),
            'matthews_corrcoef':
            metrics.matthews_corrcoef(y_true, y_pred),
            'accuracy_score':
            metrics.accuracy_score(y_true, y_pred),
            'ks_score':
            np.nan,
            'precision_score':
            metrics.precision_score(y_true, y_pred),
            'recall_score':
            metrics.recall_score(y_true, y_pred),
            'kl_div':
            kl_div_score,
            'f1_score':
            metrics.f1_score(y_true, y_pred)
        })
        # DataFrame.append was removed in pandas 2.0; concat keeps this portable
        models_report = pd.concat([models_report, scores.to_frame().T],
                                  ignore_index=True)
        conf_matrix[clf] = pd.crosstab(y_true,
                                       y_pred,
                                       rownames=['True'],
                                       colnames=['Predicted'],
                                       margins=False)
        #print('\n{} confusion matrix:'.format(clf))
        #print(conf_matrix[clf])

        # ROC curve
        fpr, tpr, thresholds = metrics.roc_curve(y_true, y_proba, pos_label=1)
        auc_score = metrics.auc(fpr, tpr)
        w = tpr - fpr
        ks_score = w.max()
        models_report.loc[models_report['model'] == clf, 'ks_score'] = ks_score
        ks_x = fpr[w.argmax()]
        ks_y = tpr[w.argmax()]
        #sc=thresholds[w.argmax()]
        #fig1,ax1=plt.subplots()
        ax1.set_title('ROC Curve')
        ax1.set_xlabel('False Positive Rate')
        ax1.set_ylabel('True Positive Rate')
        ax1.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
        ax1.plot([ks_x, ks_x], [ks_x, ks_y], '--', color='red')
        ax1.text(
            ks_x, (ks_x + ks_y) / 2,
            r'   $S_c$=%.2f, KS=%.3f' % (thresholds[w.argmax()], ks_score))
        ax1.plot(fpr, tpr, label='{}:AUC={:.5f}'.format(clf, auc_score))
        ax1.legend()
        # P-R curve
        precision, recall, thresholds = metrics.precision_recall_curve(
            y_true, y_proba, pos_label=1)
        #fig2,ax2=plt.subplots()
        ax2.plot(recall, precision, label=clf)
        ax2.set_title('P-R Curve')
        ax2.set_xlabel('Recall')
        ax2.set_ylabel('Precision')
        ax2.legend()
        #fig2.show()
        # density curves and KL divergence
        #fig3,ax3=plt.subplots()
        sns.kdeplot(y_proba[y_true == 0],
                    ax=ax3,
                    shade=True,
                    label='{}-0'.format(clf))
        sns.kdeplot(y_proba[y_true == 1],
                    ax=ax3,
                    shade=True,
                    label='{}-1'.format(clf))
        ax3.set_title('Density Curve')
        ax3.legend()
        ax3.autoscale()
        #fig3.show()

    if img_save:
        fig1.savefig('roc_curve_{}.png'.format(
            time.strftime('%Y%m%d%H%M', time.localtime())),
                     dpi=400)
        fig2.savefig('pr_curve_{}.png'.format(
            time.strftime('%Y%m%d%H%M', time.localtime())),
                     dpi=400)
        fig3.savefig('density_curve_{}.png'.format(
            time.strftime('%Y%m%d%H%M', time.localtime())),
                     dpi=400)
    else:
        fig1.show()
        fig2.show()
        fig3.show()
    models_report = models_report.set_index('model')
    #print('Model performance evaluation:')
    #print(models_report)
    return models_report, conf_matrix
예제 #59
0
        dst_img = warp_and_crop_face(raw,
                                     facial5points,
                                     reference_pts=reference_5pts,
                                     crop_size=(crop_size, crop_size))

        # (3) Convert image data to keras format
        img_data = dst_img[np.newaxis, :]
        img_data = preprocess_input(img_data)
        data_results = model.predict(img_data)

        # (4) Predict gender and other attributes
        # gender_class = np.argmax(data_results[0]) # softmax
        gender_class = 1 if data_results[0] > 0.5 else 0  # sigmoid
        pred_labels.append(gender_class)

    # Calculate F1 score
    with open("labels.txt", 'r') as f:
        for line in f.readlines():
            gt_labels.append(int(line.strip().split()[1]))
    f1 = f1_score(gt_labels, pred_labels, average='weighted')  # avoid shadowing sklearn's f1_score
    print("F1 score: {}".format(f1))

    # Correlation with Face++ annotations (kept in a separate list so the
    # labels.txt ground truth above is not mixed in and lengths stay aligned
    # with pred_labels)
    facepp_labels = []
    for json_path in tqdm.tqdm(sorted(paths.list_files("./test_images_json"))):
        face = LabelExtrator.FaceLabels(json_path).getFace(0)
        gender = np.argmax(face.Gender)
        facepp_labels.append(gender)

    corr = matthews_corrcoef(facepp_labels, pred_labels)
    print("Correlation: {}".format(corr))
예제 #60
0
    def eval(self, splt):
        """
        Evaluate on XNLI validation and test sets, for all languages.
        """
        params = self.params
        self.embedder.eval()
        self.proj.eval()
        logger.info('Set embedder and proj layer to eval mode.')

        assert splt in ['valid', 'test']
        has_labels = 'y' in self.data[splt]

        scores = OrderedDict({'epoch': self.epoch})
        task = self.task.lower()

        idxs = []  # sentence indices
        prob = []  # probabilities
        pred = []  # predicted values
        gold = []  # real values

        # lang_id = params.lang2id['en']
        lang_id = params.lang2id['fr']

        batch_idx = 0
        for batch in self.get_iterator(splt):

            # batch
            if self.n_sent == 1:
                (x, lengths), idx = batch
                x, lengths = truncate(x, lengths, params.max_len,
                                      params.eos_index)
                # logger.info('x.size={}, lengths.size={}'.format(x.size(), lengths.size()))
            else:
                (sent1, len1), (sent2, len2), idx = batch
                sent1, len1 = truncate(sent1, len1, params.max_len,
                                       params.eos_index)
                sent2, len2 = truncate(sent2, len2, params.max_len,
                                       params.eos_index)
                x, lengths, _, _ = concat_batches(sent1,
                                                  len1,
                                                  lang_id,
                                                  sent2,
                                                  len2,
                                                  lang_id,
                                                  params.pad_index,
                                                  params.eos_index,
                                                  reset_positions=False)
                # logger.info('n_sent != 1 - x.size={}, lengths.size={}'.format(x.size(), lengths.size()))

            y = self.data[splt]['y'][idx] if has_labels else None
            # logger.info('y.size={}'.format(y.size()))

            # cuda
            x, y, lengths = to_cuda(x, y, lengths)

            # prediction
            output = self.proj(
                self.embedder.get_embeddings(x,
                                             lengths,
                                             positions=None,
                                             langs=None))
            p = output.data.max(1)[1] if self.is_classif else output.squeeze(1)

            idxs.append(idx)
            prob.append(output.cpu().numpy())
            pred.append(p.cpu().numpy())
            if has_labels:
                gold.append(y.cpu().numpy())

            if batch_idx % 20 == 0:
                logger.info('Evaluating batch idx = {}'.format(batch_idx))
            batch_idx += 1

        # indices / predictions
        idxs = np.concatenate(idxs)
        prob = np.concatenate(prob)
        pred = np.concatenate(pred)
        assert len(idxs) == len(pred), (len(idxs), len(pred))
        assert idxs[-1] == len(idxs) - 1, (idxs[-1], len(idxs) - 1)

        # score the predictions if we have labels
        if has_labels:
            gold = np.concatenate(gold)
            # prefix = f'{splt}_{task}'
            prefix = '{}_{}'.format(splt, task)
            if self.is_classif:
                scores['%s_acc' %
                       prefix] = 100. * (pred == gold).sum() / len(pred)
                scores['%s_f1' % prefix] = 100. * f1_score(
                    gold,
                    pred,
                    average='binary' if params.out_features == 2 else 'micro')
                scores['%s_mc' % prefix] = 100. * matthews_corrcoef(gold, pred)
            else:
                scores['%s_prs' % prefix] = 100. * pearsonr(pred, gold)[0]
                scores['%s_spr' % prefix] = 100. * spearmanr(pred, gold)[0]
            logger.info("__log__:%s" % json.dumps(scores))

        # output predictions
        # pred_path = os.path.join(params.dump_path, f'{splt}.pred.{self.epoch}')
        pred_path = os.path.join(params.dump_path,
                                 '{}.pred.{}'.format(splt, self.epoch))
        with open(pred_path, 'w') as f:
            for i, p in zip(idxs, prob):
                f.write('%i\t%s\n' % (i, ','.join([str(x) for x in p])))
        # logger.info(f"Wrote {len(idxs)} {splt} predictions to {pred_path}")
        logger.info("Wrote {} {} predictions to {}".format(
            len(idxs), splt, pred_path))

        return scores
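
For reference, the classification/regression branch in the scoring block above maps onto plain scikit-learn and scipy calls. A self-contained sketch with dummy arrays, assuming binary labels for the classification case:

import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef

# Classification case: integer class predictions vs. gold labels
pred_cls = np.array([0, 1, 1, 0, 1])
gold_cls = np.array([0, 1, 0, 0, 1])
acc = 100. * (pred_cls == gold_cls).sum() / len(pred_cls)
f1 = 100. * f1_score(gold_cls, pred_cls, average='binary')
mcc = 100. * matthews_corrcoef(gold_cls, pred_cls)

# Regression case: continuous predictions vs. gold scores
pred_reg = np.array([0.1, 0.8, 0.4, 0.9])
gold_reg = np.array([0.0, 1.0, 0.5, 0.7])
prs = 100. * pearsonr(pred_reg, gold_reg)[0]   # Pearson correlation
spr = 100. * spearmanr(pred_reg, gold_reg)[0]  # Spearman correlation

print(acc, f1, mcc, prs, spr)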