def evaluate(norm, ignored_feats=set()):
    """
    Evaluates the saved model against the eval file
    :param norm:
    :param ignored_feats:
    :return:
    """
    global log, eval_file, model_file, scores_file, meta_dict

    log.info("Loading model from file")
    logistic = cPickle.load(open(model_file, 'r'))

    log.info("Loading eval data")
    x_eval, y_eval, ids_eval = \
        data_util.load_very_sparse_feats(eval_file, meta_dict,
                                         ignored_feats)
    if norm is not None:
        normalize(x_eval, norm=norm, copy=False)
    #endif

    log.info("Evaluating")
    y_pred_probs = logistic.predict_log_proba(x_eval)

    # Though we don't evaluate against them here, we want
    # to store scores for both ij and ji pairs
    pred_scores = dict()
    for i in range(len(ids_eval)):
        pred_scores[ids_eval[i]] = y_pred_probs[i]
    pred_scores = induce_ji_predictions(pred_scores)

    # We evaluate here only on ij pairs, but since this script
    # is not our final evaluation (and because the score should be
    # identical anyway) that's fine; this is functionally an estimate
    # of the true score
    scores = ScoreDict()
    for i in range(len(y_eval)):
        scores.increment(y_eval[i], np.argmax(y_pred_probs[i]))

    log.info("---Confusion matrix---")
    scores.print_confusion()

    log.info("---Scores---")
    for label in scores.keys:
        print str(label) + "\t" + scores.get_score(label).to_string() + " - %d (%.2f%%)" % \
              (scores.get_gold_count(label), scores.get_gold_percent(label))
    print "Acc: " + str(scores.get_accuracy()) + "%"

    if scores_file is not None:
        log.info("Writing probabilities to " + scores_file)
        with open(scores_file, 'w') as f:
            for id in pred_scores.keys():
                line = list()
                line.append(id)
                for j in range(len(pred_scores[id])):
                    line.append(str(pred_scores[id][j]))
                f.write(','.join(line) + '\n')
            #endfor
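
# A minimal, self-contained sketch (toy data, not the project's feature files)
# of the prediction pattern used in evaluate() above: predict_log_proba()
# returns one row of per-class log probabilities per example, and np.argmax
# over a row yields a column index into classes_. evaluate() passes that index
# straight to ScoreDict, which assumes the labels are simply 0..k-1.
import numpy as np
from sklearn.linear_model import LogisticRegression

_x = np.array([[0.0], [1.0], [2.0], [3.0]])
_y = np.array([0, 0, 1, 1])
_clf = LogisticRegression().fit(_x, _y)
_log_probs = _clf.predict_log_proba(_x)               # shape (4, 2)
_pred = _clf.classes_[np.argmax(_log_probs, axis=1)]  # e.g. [0 0 1 1]
print(_pred)
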
def train(model,
          balance,
          max_iter=None,
          max_depth=None,
          num_estimators=None,
          warm_start=None,
          ignored_feats=set()):
    """
    Trains the model
    :param model: Model type; one of 'svm', 'logistic', 'decision_tree',
                  or 'random_forest'
    :param balance: Whether to use balanced class weights
    :param max_iter: Maximum number of solver iterations (svm / logistic)
    :param max_depth: Maximum tree depth (decision_tree / random_forest)
    :param num_estimators: Number of trees (random_forest)
    :param warm_start: Whether to reuse the previous fit's solution (random_forest)
    :param ignored_feats: Set of feature names to exclude from the data
    :return:
    """
    global log, train_file, meta_dict, model_file

    log.tic('info', "Loading training data")
    x, y, ids = \
        data_util.load_very_sparse_feats(train_file,
                                         meta_dict,
                                         ignored_feats)

    log.toc('info')

    log.tic('info', "Training")
    class_weight = None
    if balance:
        class_weight = 'balanced'
    #endif
    learner = None
    if model == 'svm':
        learner = SVC(probability=True,
                      class_weight=class_weight,
                      max_iter=max_iter)
    elif model == 'logistic':
        learner = LogisticRegression(class_weight=class_weight,
                                     max_iter=max_iter,
                                     n_jobs=-1)
    elif model == "decision_tree":
        learner = DecisionTreeClassifier(max_depth=max_depth,
                                         class_weight=class_weight)
    elif model == 'random_forest':
        learner = RandomForestClassifier(n_estimators=num_estimators,
                                         max_depth=max_depth,
                                         n_jobs=-1,
                                         warm_start=warm_start,
                                         class_weight=class_weight)
    else:
        raise ValueError("Unknown model type: {}".format(model))
    #endif
    learner.fit(x, y)
    log.toc('info')

    log.info("Saving model")
    with open(model_file, 'wb') as pickle_file:
        cPickle.dump(learner, pickle_file)
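
# A small, self-contained sketch (toy data) of the save/load round trip shared
# by train() and evaluate(): the fitted learner is pickled in binary mode and
# later reloaded for prediction. cPickle is the Python 2 module; the fallback
# import below is only needed if this were run under Python 3.
try:
    import cPickle as pickle_mod  # Python 2
except ImportError:
    import pickle as pickle_mod   # Python 3
from sklearn.tree import DecisionTreeClassifier

_clf = DecisionTreeClassifier(max_depth=2, class_weight='balanced')
_clf.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
with open('toy_model.pkl', 'wb') as _f:
    pickle_mod.dump(_clf, _f)
with open('toy_model.pkl', 'rb') as _f:
    _restored = pickle_mod.load(_f)
print(_restored.predict([[2]]))   # [1]
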
def evaluate(ignored_feats=set()):
    """
    Evaluates the model, optionally ignoring features and saving
    predicted class scores
    :param ignored_feats: Set of feature names to exclude from the data
    :return:
    """
    global log, eval_file, model_file, scores_file, meta_dict

    log.info("Loading model from file")
    with open(model_file, 'rb') as pickle_file:
        learner = cPickle.load(pickle_file)

    log.info("Loading eval data")
    x_eval, y_eval, ids_eval = \
        data_util.load_very_sparse_feats(eval_file, meta_dict,
                                         ignored_feats)

    log.info("Evaluating")
    y_pred_probs = learner.predict_log_proba(x_eval)
    scores = ScoreDict()
    for i in range(len(y_eval)):
        scores.increment(y_eval[i], np.argmax(y_pred_probs[i]))
    #endfor

    log.info("---Confusion matrix---")
    scores.print_confusion()

    log.info("---Scores---")
    for label in scores.keys:
        print str(label) + "\t" + scores.get_score(label).to_latex_string() + \
              " & %.2f\\%%\\\\" % scores.get_gold_percent(label)
    kurtoses = list()
    for log_proba in y_pred_probs:
        kurtoses.append(
            stats.kurtosis(log_proba, axis=0, fisher=True, bias=True))
    log.info(None, "Accuracy: %.2f%%; Kurtoses: %.2f", scores.get_accuracy(),
             sum(kurtoses) / len(kurtoses))

    log.info("Writing probabilities to file")
    with open(scores_file, 'w') as f:
        for i in range(len(ids_eval)):
            line = list()
            line.append(ids_eval[i])
            for j in range(len(y_pred_probs[i])):
                line.append(str(y_pred_probs[i][j]))
            f.write(','.join(line) + '\n')
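
# A self-contained sketch (toy probability vectors) of the kurtosis statistic
# averaged in evaluate() above: scipy.stats.kurtosis (Fisher definition) grows
# as a row of log probabilities becomes more sharply peaked, so the mean over
# all examples is reported alongside accuracy as a rough indicator of how
# confident the classifier's predicted distributions are.
import numpy as np
from scipy import stats

_peaked = np.log(np.array([0.91] + [0.01] * 9))        # one dominant class
_flat = np.log(np.array([0.11] + [0.1] * 8 + [0.09]))  # nearly uniform
print(stats.kurtosis(_peaked, axis=0, fisher=True, bias=True))  # larger
print(stats.kurtosis(_flat, axis=0, fisher=True, bias=True))    # smaller
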
def train(solver,
          max_iter,
          balance,
          norm,
          warm_start,
          multiclass_mode,
          ignored_feats=set()):
    """
    Trains the relation model as a multinomial logistic regression model
    :param solver: sklearn LogisticRegression solver (e.g. 'lbfgs')
    :param max_iter: Maximum number of solver iterations
    :param balance: Whether to use balanced class weights
    :param norm: Row normalization to apply to the feature matrix
                 ('l1', 'l2', 'max', or None for no normalization)
    :param warm_start: Whether to reuse the previous fit's solution
    :param multiclass_mode: sklearn multi_class setting ('multinomial' or 'ovr')
    :param ignored_feats: Set of feature names to exclude from the data
    :return:
    """
    global log, meta_dict, train_file, model_file

    log.tic('info', "Loading training data")
    x, y, ids = \
        data_util.load_very_sparse_feats(train_file, meta_dict,
                                         ignored_feats)
    if norm is not None:
        normalize(x, norm=norm, copy=False)
    log.toc('info')

    log.tic('info', "Training")
    class_weight = None
    if balance:
        class_weight = 'balanced'
    #endif
    logistic = LogisticRegression(class_weight=class_weight,
                                  solver=solver,
                                  max_iter=max_iter,
                                  multi_class=multiclass_mode,
                                  n_jobs=-1,
                                  warm_start=warm_start)
    logistic.fit(x, y)
    log.toc('info')

    log.info("Saving model")
    with open(model_file, 'wb') as pickle_file:
        cPickle.dump(logistic, pickle_file)
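
# A brief, self-contained sketch (toy sparse matrix) of the normalization
# applied before fitting: sklearn.preprocessing.normalize with copy=False
# rescales each row of a CSR matrix in place to unit norm ('l1', 'l2', or
# 'max'), which is why train() and evaluate() should be called with the same
# norm argument for the resulting scores to be comparable.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

_m = csr_matrix(np.array([[3.0, 4.0, 0.0],
                          [0.0, 1.0, 1.0]]))
normalize(_m, norm='l2', copy=False)
print(_m.toarray())   # first row becomes [0.6, 0.8, 0.0]
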
def train(max_iter, balance=False, warm_start=None, ignored_feats=set()):
    """
    Trains the cardinality classifier as a multinomial logistic regression
    model with max_iter iterations; optional parameters enable balanced class
    weights, warm start, and the ability to ignore features
    :param max_iter: Maximum number of solver iterations
    :param balance: Whether to use balanced class weights
    :param warm_start: Whether to reuse the previous fit's solution
    :param ignored_feats: Set of feature names to exclude from the data
    :return:
    """
    global log, train_file, meta_dict, model_file

    log.tic('info', "Loading training data")
    x, y, ids = \
        data_util.load_very_sparse_feats(train_file,
                                         meta_dict,
                                         ignored_feats)
    log.toc('info')

    log.tic('info', "Training")
    class_weight = None
    if balance:
        class_weight = 'balanced'
    #endif

    learner = LogisticRegression(class_weight=class_weight,
                                 solver='lbfgs',
                                 max_iter=max_iter,
                                 multi_class='multinomial',
                                 n_jobs=-1,
                                 warm_start=warm_start)
    #learner = mord.OrdinalRidge(max_iter=max_iter)

    learner.fit(x, y)
    log.toc('info')

    log.info("Saving model")
    with open(model_file, 'wb') as pickle_file:
        cPickle.dump(learner, pickle_file)
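
# A short sketch (toy data) of the warm_start option exposed above: with
# warm_start=True, a second call to fit() starts from the previously learned
# coefficients instead of re-initializing them, which is mainly useful when
# re-training repeatedly on growing or slightly modified data.
import numpy as np
from sklearn.linear_model import LogisticRegression

_x = np.array([[0.0], [1.0], [2.0], [3.0]])
_y = np.array([0, 0, 1, 1])
_clf = LogisticRegression(solver='lbfgs', max_iter=200, warm_start=True)
_clf.fit(_x, _y)   # first fit
_clf.fit(_x, _y)   # second fit continues from the previous solution
print(_clf.coef_)
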
def evaluate(lemma_file=None,
             hyp_file=None,
             ignored_feats=set(),
             save_scores=True):
    """
    Evaluates the model against the eval data
    :param lemma_file: Optional CSV file mapping mention ids to lemmas
    :param hyp_file: Optional JSON file mapping mention ids to hypernyms
    :param ignored_feats: Set of feature names to exclude from the data
    :param save_scores: Whether to write log probabilities to the scores file
    :return:
    """
    global log, eval_file, model_file, scores_file, meta_dict

    log.info("Loading model from file")
    with open(model_file, 'rb') as pickle_file:
        learner = cPickle.load(pickle_file)

    log.info("Loading eval data")
    x_eval, y_eval, ids_eval = \
        data_util.load_very_sparse_feats(eval_file, meta_dict,
                                         ignored_feats)

    lemma_dict = dict()
    lemmas = set()
    if lemma_file is not None:
        log.info("Loading mention lemmas")
        with open(lemma_file, 'r') as f:
            for line in f:
                parts = line.replace('"', '').strip().split(",")
                lemma_dict[parts[0]] = parts[1]
                lemmas.add(parts[1])
            #endfor
        #endwith
    #endif

    hypernyms = set()
    if hyp_file is not None:
        log.info("Loading mention hypernyms")
        with open(hyp_file, 'r') as f:
            id_hyp_dict = json.load(f)
        for hyps in id_hyp_dict.values():
            if isinstance(hyps, list):
                for h in hyps:
                    hypernyms.add(h)
            else:
                hypernyms.add(hyps)
            #endif
        #endfor
    #endif

    log.info("Evaluating")
    lemma_scores = dict()
    for l in lemmas:
        lemma_scores[l] = ScoreDict()
    hyp_scores = dict()
    for h in hypernyms:
        hyp_scores[h] = ScoreDict()

    y_pred_eval = learner.predict_proba(x_eval)
    scores = ScoreDict()
    pred_scores = dict()
    for idx in range(len(y_pred_eval)):
        id = ids_eval[idx]
        pred_scores[id] = y_pred_eval[idx]

        pred = 0 if pred_scores[id][0] > pred_scores[id][1] else 1
        scores.increment(y_eval[idx], pred)
    #endfor

    log.info("---Confusion matrix---")
    scores.print_confusion()

    log.info("---Scores---")
    for label in scores.keys:
        print str(label) + "\t" + scores.get_score(label).to_string() + " - %d (%.2f%%)" % \
              (scores.get_gold_count(label), scores.get_gold_percent(label))
    log.info(None, "Accuracy: %.2f%%", scores.get_accuracy())

    if save_scores:
        log.info("Writing scores to " + scores_file)
        with open(scores_file, 'w') as f:
            for id in pred_scores.keys():
                score_line = list()
                score_line.append(id)
                for s in pred_scores[id]:
                    score = s
                    if score == 0:
                        score = np.nextafter(0, 1)
                    score = str(np.log(score))
                    score_line.append(score)
                #endfor
                f.write(','.join(score_line) + '\n')
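
# A minimal sketch of the log(0) guard used when writing scores above: a
# predicted probability of exactly zero is replaced with the smallest positive
# float (np.nextafter(0, 1)) so that np.log() produces a large negative finite
# number instead of -inf in the scores file.
import numpy as np

_p_safe = np.nextafter(0, 1)   # ~5e-324, the smallest positive double
print(np.log(_p_safe))         # roughly -744.44, not -inf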