Example #1
def learning_curve_avg(classifiers, datasets, outdir, pickle_name='learning'):
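    # Loads the pickled learning-curve results and prints averaged metrics per
    # dataset and classifier. In each result tuple, index 0 appears to hold the
    # macro-average score, index 11 the mean ambiguity and index 13 the mean
    # recall (matching the classifier_result tuple built in Example #8).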

    with open(_get_learning_pickle_path(outdir, name=pickle_name),
              'r') as results_file:
        results = pickle_load(results_file)

    for dataset in datasets:
        print 'Dataset:', dataset
        for classifier in classifiers:
            print 'Classifier:', classifier
            macro_avg = mean([
                res_tup[0]
                for res_tup in results[dataset][classifier].itervalues()
            ]) * 100
            macro_tip = sorted((size, res_tup[0])
                               for size, res_tup in results[dataset]
                               [classifier].iteritems())[-1][1] * 100
            amb_avg = mean([
                res_tup[11]
                for res_tup in results[dataset][classifier].itervalues()
            ])
            amb_tip = sorted((size, res_tup[11]) for size, res_tup in
                             results[dataset][classifier].iteritems())[-1][1]
            rec_avg = mean([
                res_tup[13]
                for res_tup in results[dataset][classifier].itervalues()
            ]) * 100
            rec_tip = sorted((size, res_tup[13])
                             for size, res_tup in results[dataset]
                             [classifier].iteritems())[-1][1] * 100

            print(
                '{:.2f}/{:.2f}/{:.2f}/{:.2f}/{:.2f}/{:.2f} '
                'MACROAVG/MACROTIP/AMBAVG/AMBTIP/RECAVG/RECTIP').format(
                    macro_avg, macro_tip, amb_avg, amb_tip, rec_avg, rec_tip)
Example #2
def learning_curve_avg(classifiers, datasets, outdir, pickle_name='learning'):

    with open(_get_learning_pickle_path(outdir, name=pickle_name), 'r') as results_file:
        results = pickle_load(results_file)

    for dataset in datasets:
        print 'Dataset:', dataset
        for classifier in classifiers:
            print 'Classifier:', classifier
            macro_avg = mean([res_tup[0] for res_tup
                in results[dataset][classifier].itervalues()]) * 100
            macro_tip = sorted((size, res_tup[0]) for size, res_tup
                    in results[dataset][classifier].iteritems())[-1][1] * 100
            amb_avg = mean([res_tup[11] for res_tup
                in results[dataset][classifier].itervalues()])
            amb_tip = sorted((size, res_tup[11]) for size, res_tup
                    in results[dataset][classifier].iteritems())[-1][1]
            rec_avg = mean([res_tup[13] for res_tup
                in results[dataset][classifier].itervalues()]) * 100
            rec_tip = sorted((size, res_tup[13]) for size, res_tup
                    in results[dataset][classifier].iteritems())[-1][1] * 100

            print ('{:.2f}/{:.2f}/{:.2f}/{:.2f}/{:.2f}/{:.2f} '
                    'MACROAVG/MACROTIP/AMBAVG/AMBTIP/RECAVG/RECTIP').format(
                    macro_avg, macro_tip, amb_avg, amb_tip, rec_avg, rec_tip)
Example #3
def score_classifier_by_tup(classifier, test_tups):
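    # Scores a trained classifier against (labels, vectors) test tuples:
    # accumulates per-class (TP, FP, FN) counts, then returns a pooled
    # TP / (TP + FN) score, the mean of the per-class scores, the TP and FN
    # sums, and the raw per-class counts.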
    # (TP, FP, FN) # Leaving out TN
    results_by_class = {} #XXX: THIS HAS TO BE A CLASS!

    for test_lbl, test_vec in izip(*test_tups):
        if not isinstance(test_lbl, str):
            test_lbl_type = classifier.name_by_lbl_id[test_lbl]
        else:
            test_lbl_type = test_lbl

        # TODO: Cast annotation into span! It needs to be censored
        predicted = classifier._classify(test_vec)
        
        try:
            results_by_class[test_lbl_type]
        except KeyError:
            results_by_class[test_lbl_type] = (0, 0, 0)

        try:
            results_by_class[predicted]
        except KeyError:
            results_by_class[predicted] = (0, 0, 0)

        a_tp, a_fp, a_fn = results_by_class[test_lbl_type]
        p_tp, p_fp, p_fn = results_by_class[predicted]

        if predicted == test_lbl_type:
            results_by_class[test_lbl_type] = (a_tp + 1, a_fp, a_fn)
        else:
            results_by_class[test_lbl_type] = (a_tp, a_fp, a_fn + 1)
            results_by_class[predicted] = (p_tp, p_fp + 1, p_fn)

    # Extend the results to:
    # macro, micro, {RESULTS_BY_CLASS}
    tp_sum = sum([tp for tp, _, _ in results_by_class.itervalues()])
    fn_sum = sum([fn for _, _, fn in results_by_class.itervalues()])
    macro_score = tp_sum / float(tp_sum + fn_sum)
    
    micro_scores = []
    for res_tup in results_by_class.itervalues():
        m_tp, _, m_fn = res_tup
        m_tot = float(m_tp + m_fn)
        if m_tot <= 0:
            micro_scores.append(1.0)
        else:
            micro_scores.append(m_tp / float(m_tp + m_fn))
    micro_score = mean(micro_scores)

    return (macro_score, micro_score, tp_sum, fn_sum, results_by_class)
Example #4
def score_classifier(classifier, test_set):
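    # Same scoring scheme as score_classifier_by_tup (Example #3), but walks
    # the document/sentence/annotation structure of the test set and calls
    # classifier.classify() instead of consuming pre-built (label, vector)
    # tuples.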
    # (TP, FP, FN) # Leaving out TN
    results_by_class = {} #XXX: THIS HAS TO BE A CLASS!

    for document in test_set:
        for sentence in document:
            for annotation in sentence:
                # TODO: Cast annotation into span! It needs to be censored
                predicted = classifier.classify(document, sentence, annotation)
                
                try:
                    results_by_class[annotation.type]
                except KeyError:
                    results_by_class[annotation.type] = (0, 0, 0)

                try:
                    results_by_class[predicted]
                except KeyError:
                    results_by_class[predicted] = (0, 0, 0)

                a_tp, a_fp, a_fn = results_by_class[annotation.type]
                p_tp, p_fp, p_fn = results_by_class[predicted]

                if predicted == annotation.type:
                    results_by_class[annotation.type] = (a_tp + 1, a_fp, a_fn)
                else:
                    results_by_class[annotation.type] = (a_tp, a_fp, a_fn + 1)
                    results_by_class[predicted] = (p_tp, p_fp + 1, p_fn)

    # Extend the results to:
    # macro, micro, {RESULTS_BY_CLASS}
    tp_sum = sum([tp for tp, _, _ in results_by_class.itervalues()])
    fn_sum = sum([fn for _, _, fn in results_by_class.itervalues()])
    macro_score = tp_sum / float(tp_sum + fn_sum)
    
    micro_scores = []
    for res_tup in results_by_class.itervalues():
        m_tp, _, m_fn = res_tup
        m_tot = float(m_tp + m_fn)
        if m_tot <= 0:
            micro_scores.append(1.0)
        else:
            micro_scores.append(m_tp / float(m_tp + m_fn))
    micro_score = mean(micro_scores)

    return (macro_score, micro_score, tp_sum, fn_sum, results_by_class)
Example #5
def manual_describe(df: pd.DataFrame) -> pd.DataFrame:
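    # Re-implements a subset of pandas DataFrame.describe() using the custom
    # helpers count, mean, standard_deviation, quantile, median, min_ and
    # max_, which are assumed to be defined elsewhere in this module.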
    output_df = pd.DataFrame(
        columns=[columnName for (columnName, columnData) in df.items()],
        index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max'])
    for (columnName, columnData) in df.items():
        if columnName in output_df.columns:
            my_values = [
                x for x in columnData.values[~np.isnan(columnData.values)]
            ]
            my_values.sort()
            count_val = count(my_values)
            mean_val = mean(my_values)
            std_val = standard_deviation(my_values)
            min_val = min_(my_values)
            quant_25_val = quantile(my_values, 0.25)
            quant_50_val = median(my_values)
            quant_75_val = quantile(my_values, 0.75)
            max_val = max_(my_values)
            output_df[columnName] = [
                count_val, mean_val, std_val, min_val, quant_25_val,
                quant_50_val, quant_75_val, max_val
            ]
    return output_df
Example #6
def _knockout_pass(f_id, classifier, train_data, folds, to_censor):
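    # One pass of a feature knock-out experiment: for every fold, trains the
    # classifier on sparse vectors with the features in to_censor removed,
    # scores it on the held-out fold, and returns f_id together with the mean
    # macro score across folds.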
    macro_scores = []
    for fold_num, fold in enumerate(folds, start=1):
        train_set = train_data - fold
        test_set = fold

        assert len(train_set) + len(test_set) == len(train_data)

        train_vecs = [d for d in _censor_sparse_vectors_gen(
            (v for _, v in train_set), to_censor)]
        train_lbls = [l for l, _ in train_set]

        classifier._liblinear_train(train_lbls, train_vecs)

        test_vecs = [d for d in _censor_sparse_vectors_gen(
            (v for _, v in test_set), to_censor)]
        test_lbls = (l for l, _ in test_set)
        res_tup = score_classifier_by_tup(classifier, (test_lbls, test_vecs))
        macro_scores.append(res_tup[0])

    macro_mean = mean(macro_scores)

    return f_id, macro_mean
Example #7
top = input("Pick a top number? ")
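# NOTE: this snippet is an excerpt; `bottom`, the `random` import and the
# `maths` helper module (providing maths.mean) are assumed to be set up
# earlier in the script.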
guess_range = range(bottom, top+1)
ans = random.randint(bottom, top)
games = 0
average_guesses = []
again = 'y'

while again == 'y':
	ans = random.randint(bottom, top)
	games += 1
	print "Game %d: Number picked!..." % games
	guesses = 0
	guess = '' 
	while guess != ans:
		print "Guess #%d:" % (guesses+1)
		guess = input("> ")
		guesses += 1
		if guess > ans:
			print "Too high,"
		elif guess < ans:
			print "Too low,"
		else:
			pass

	if guess == ans:
		average_guesses.append(guesses)
		again = raw_input("Yes! Play again? (y/n)")
avg_guess = maths.mean(average_guesses)
print average_guesses
print "End of game, %d games played with an average of %f guesses." % (games, avg_guess)
Example #8
def _learning_curve_test_data_set(classifiers,
                                  train,
                                  test,
                                  worker_pool,
                                  verbose=False,
                                  no_simstring_cache=False,
                                  use_test_set=False,
                                  folds=10,
                                  min_perc=5,
                                  max_perc=100,
                                  step_perc=5,
                                  it_factor=1):
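    # Trains each classifier on growing fractions of the training data
    # (from min_perc to max_perc per cent, in steps of step_perc), scores
    # every trained fold model on the held-out test set, and returns the
    # aggregated per-sample-size result tuples keyed by classifier id.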

    # XXX: Not necessary any more!
    if verbose:
        print >> stderr, 'Calculating train set size...',
    train_size = 0
    for d in train:
        for s in d:
            for a in s:
                train_size += 1
    if verbose:
        print >> stderr, 'Done!'
    # XXX:

    if not no_simstring_cache:
        simstring_caching(classifiers, (train, test), verbose=verbose)

    # Collect the seen type to iterate over later
    seen_types = set()
    results_by_classifier = {}

    for classifier_id, classifier_class in classifiers.iteritems():
        if verbose:
            print >> stderr, 'Classifier:', classifier_id, '...',

        from classifier.liblinear import hashabledict

        classifier = classifier_class()
        if verbose:
            print >> stderr, 'featurising train:', '...',
        train_lbls, train_vecs = classifier._gen_lbls_vecs(train)
        train_set = [e for e in izip(train_lbls, train_vecs)]
        assert len(train_lbls) == train_size, '{} != {}'.format(
            len(train_lbls), train_size)
        assert len(train_vecs) == train_size, '{} != {}'.format(
            len(train_vecs), train_size)
        assert len(train_set) == train_size, '{} != {}'.format(
            len(train_set), train_size)
        del train_lbls
        del train_vecs
        if verbose:
            print >> stderr, 'Done!',
            print >> stderr, 'featurising test', '...',
        test_lbls, test_vecs = classifier._gen_lbls_vecs(test)
        test_vecs = [hashabledict(d) for d in test_vecs]
        if verbose:
            print >> stderr, 'Done!',

        # Fix the seed so that we get comparable folds
        seed(0xd5347d33)
        args = ((classifier, fold) for fold in _train_fold_gen(
            train_set, min_perc, max_perc, step_perc, it_factor))

        if worker_pool is None:
            res_it = (_train_fold(*arg) for arg in args)
        else:
            res_it = worker_pool.imap(__train_fold, args)

        classifier_results = defaultdict(list)

        print >> stderr, 'Training and evaluating models: ...',

        i = 0
        for sample_size, fold_classifier in res_it:
            score, new_score = _score_classifier(fold_classifier, test_lbls,
                                                 test_vecs)
            classifier_results[sample_size].append((score, new_score))
            i += 1
            if i % 10 == 0:
                print >> stderr, i, '...',
        print >> stderr, 'Done!'

        # Process the results
        for sample_size in sorted(e for e in classifier_results):
            results = classifier_results[sample_size]
            scores = [score for score, _ in results]
            new_scores = [new_score for _, new_score in results]

            macro_scores = [ms for ms, _, _, _, _ in scores]
            micro_scores = [ms for _, ms, _, _, _ in scores]
            tps = [tp for _, _, tp, _, _ in scores]
            fns = [fn for _, _, _, fn, _ in scores]
            res_dics = [d for _, _, _, _, d in scores]

            # New metrics
            ranks = [mean(rs) for rs, _, _ in new_scores]
            ambiguities = [mean(ambs) for _, ambs, _ in new_scores]
            recalls = [r for _, _, r in new_scores]

            # These are means of means
            ranks_mean = mean(ranks)
            ranks_stddev = stddev(ranks)
            ambiguities_mean = mean(ambiguities)
            ambiguities_stddev = stddev(ambiguities)
            recalls_mean = mean(recalls)
            recalls_stddev = stddev(recalls)

            classifier_result = (
                mean(macro_scores),
                stddev(macro_scores),
                mean(micro_scores),
                stddev(micro_scores),
                mean(tps),
                stddev(tps),
                mean(fns),
                stddev(fns),
                res_dics,
                # New metrics
                ranks_mean,
                ranks_stddev,
                ambiguities_mean,
                ambiguities_stddev,
                recalls_mean,
                recalls_stddev)

            classifier_results[sample_size] = classifier_result

            if verbose:
                res_str = (
                    'Results {size}: '
                    'MACRO: {0:.3f} MACRO_STDDEV: {1:.3f} '
                    'MICRO: {2:.3f} MICRO_STDDEV: {3:.3f} '
                    'TP: {4:.3f} TP_STDDEV: {5:.3f} '
                    'MEAN_RANK: {mean_rank:.3f} MEAN_RANK_STDDEV: {mean_rank_stddev:.3f} '
                    'AVG_AMB: {avg_amb:.3f} AVG_AMB_STDDEV: {avg_amb_stddev:.3f} '
                    'RECALL: {recall:.3f} RECALL_STDDEV: {recall_stddev:.3f}'
                ).format(*classifier_result,
                         size=sample_size,
                         mean_rank=ranks_mean,
                         mean_rank_stddev=ranks_stddev,
                         avg_amb=ambiguities_mean,
                         avg_amb_stddev=ambiguities_stddev,
                         recall=recalls_mean,
                         recall_stddev=recalls_stddev)
                print res_str

        results_by_classifier[classifier_id] = classifier_results
    return results_by_classifier
Example #9
def _learning_curve_test_data_set(classifiers, train, test,
        worker_pool, verbose=False, no_simstring_cache=False,
        use_test_set=False, folds=10, min_perc=5, max_perc=100, step_perc=5,
        it_factor=1):

    # XXX: Not necessary any more!
    if verbose:
        print >> stderr, 'Calculating train set size...',
    train_size = 0
    for d in train:
        for s in d:
            for a in s:
                train_size += 1
    if verbose:
        print >> stderr, 'Done!'
    # XXX:

    if not no_simstring_cache:
        simstring_caching(classifiers, (train, test), verbose=verbose)

    # Collect the seen type to iterate over later
    seen_types = set()
    results_by_classifier = {}

    for classifier_id, classifier_class in classifiers.iteritems():
        if verbose:
            print >> stderr, 'Classifier:', classifier_id, '...',

        from classifier.liblinear import hashabledict

        classifier = classifier_class()
        if verbose:
            print >> stderr, 'featurising train:', '...',
        train_lbls, train_vecs = classifier._gen_lbls_vecs(train)
        train_set = [e for e in izip(train_lbls, train_vecs)]
        assert len(train_lbls) == train_size, '{} != {}'.format(
                len(train_lbls), train_size)
        assert len(train_vecs) == train_size, '{} != {}'.format(
                len(train_vecs), train_size)
        assert len(train_set) == train_size, '{} != {}'.format(
                len(train_set), train_size)
        del train_lbls
        del train_vecs
        if verbose:
            print >> stderr, 'Done!',
            print >> stderr, 'featurising test', '...',
        test_lbls, test_vecs = classifier._gen_lbls_vecs(test)
        test_vecs = [hashabledict(d) for d in test_vecs]
        if verbose:
            print >> stderr, 'Done!',

        # Fix the seed so that we get comparable folds
        seed(0xd5347d33)
        args = ((classifier, fold) for fold in _train_fold_gen(train_set,
            min_perc, max_perc, step_perc, it_factor))

        if worker_pool is None:
            res_it = (_train_fold(*arg) for arg in args)
        else:
            res_it = worker_pool.imap(__train_fold, args)

        classifier_results = defaultdict(list)

        print >> stderr, 'Training and evaluating models: ...',

        i = 0
        for sample_size, fold_classifier in res_it:
            score, new_score = _score_classifier(fold_classifier, test_lbls,
                    test_vecs)
            classifier_results[sample_size].append((score, new_score))
            i += 1
            if i % 10 == 0:
                print >> stderr, i, '...',
        print >> stderr, 'Done!'

        # Process the results
        for sample_size in sorted(e for e in classifier_results):
            results = classifier_results[sample_size]
            scores = [score for score, _ in results]
            new_scores = [new_score for _, new_score in results]

            macro_scores = [ms for ms, _, _, _, _ in scores]
            micro_scores = [ms for _, ms, _, _, _ in scores]
            tps = [tp for _, _, tp, _, _ in scores]
            fns = [fn for _, _, _, fn, _ in scores]
            res_dics = [d for _, _, _, _, d in scores]

            # New metrics
            ranks = [mean(rs) for rs, _, _ in new_scores]
            ambiguities = [mean(ambs) for _, ambs, _ in new_scores]
            recalls = [r for _, _, r in new_scores]

            # These are means of means
            ranks_mean = mean(ranks)
            ranks_stddev = stddev(ranks)
            ambiguities_mean = mean(ambiguities)
            ambiguities_stddev = stddev(ambiguities)
            recalls_mean = mean(recalls)
            recalls_stddev = stddev(recalls)

            classifier_result = (
                    mean(macro_scores), stddev(macro_scores),
                    mean(micro_scores), stddev(micro_scores),
                    mean(tps), stddev(tps),
                    mean(fns), stddev(fns),
                    res_dics,
                    # New metrics
                    ranks_mean, ranks_stddev,
                    ambiguities_mean, ambiguities_stddev,
                    recalls_mean, recalls_stddev
                    )


            classifier_results[sample_size] = classifier_result
            
            if verbose:
                res_str = ('Results {size}: '
                        'MACRO: {0:.3f} MACRO_STDDEV: {1:.3f} '
                        'MICRO: {2:.3f} MICRO_STDDEV: {3:.3f} '
                        'TP: {4:.3f} TP_STDDEV: {5:.3f} '
                        'MEAN_RANK: {mean_rank:.3f} MEAN_RANK_STDDEV: {mean_rank_stddev:.3f} '
                        'AVG_AMB: {avg_amb:.3f} AVG_AMB_STDDEV: {avg_amb_stddev:.3f} '
                        'RECALL: {recall:.3f} RECALL_STDDEV: {recall_stddev:.3f}'
                        ).format(*classifier_result,
                                size=sample_size,
                                mean_rank=ranks_mean,
                                mean_rank_stddev=ranks_stddev,
                                avg_amb=ambiguities_mean,
                                avg_amb_stddev=ambiguities_stddev,
                                recall=recalls_mean,
                                recall_stddev=recalls_stddev
                                )
                print res_str

        results_by_classifier[classifier_id] = classifier_results
    return results_by_classifier
Example #10
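# NOTE: this snippet is an excerpt; `bottom`, `top`, the `random` import and
# the `maths` helper module (providing maths.mean) are assumed to be set up
# earlier in the script.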
guess_range = range(bottom, top + 1)
ans = random.randint(bottom, top)
games = 0
average_guesses = []
again = 'y'

while again == 'y':
    ans = random.randint(bottom, top)
    games += 1
    print "Game %d: Number picked!..." % games
    guesses = 0
    guess = ''
    while guess != ans:
        print "Guess #%d:" % (guesses + 1)
        guess = input("> ")
        guesses += 1
        if guess > ans:
            print "Too high,"
        elif guess < ans:
            print "Too low,"
        else:
            pass

    if guess == ans:
        average_guesses.append(guesses)
        again = raw_input("Yes! Play again? (y/n)")
avg_guess = maths.mean(average_guesses)
print average_guesses
print "End of game, %d games played with an average of %f guesses." % (
    games, avg_guess)