Example #1
def run_baseline(article_count, split=0.9):
    articles, total_token_count = preprocess_wsj(article_count, [])
    train, test = bifurcate(articles, split, shuffle=True)

    counts = dict(DEF=0, INDEF=0)
    for article in train:
        for sentence in article:
            for def_tag in sentence.def_tags:
                # only use def / indef tokens
                if def_tag in ('DEF', 'INDEF'):
                    counts[def_tag] += 1

    mle = 'DEF' if counts['DEF'] > counts['INDEF'] else 'INDEF'

    test_det_def_tags = []
    for article in test:
        for sentence in article:
            for def_tag in sentence.def_tags:
                if def_tag in ('DEF', 'INDEF'):
                    test_det_def_tags.append(def_tag)

    correct, wrong = matches([mle]*len(test_det_def_tags), test_det_def_tags)

    return dict(
        total_articles_count=len(articles),  # int
        total_token_count=total_token_count,  # int
        train_count=len(train),  # int
        test_count=len(test),  # int
        kernel='MLE=%s' % mle,
        correct=correct,
        wrong=wrong,
        total=correct + wrong,
    )
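The baseline above (and the runners that follow) lean on project helpers that are not shown, most importantly bifurcate and matches. Below is a minimal sketch of what those two are assumed to do, inferred only from how they are called; these are guesses, not the project's actual implementations.

import random

def bifurcate(items, split, shuffle=False):
    # Assumed helper: split a sequence into (train, test), where `split` is the train fraction.
    items = list(items)
    if shuffle:
        random.shuffle(items)
    cutoff = int(len(items) * split)
    return items[:cutoff], items[cutoff:]

def matches(xs, ys):
    # Assumed helper: count positions where two equal-length label sequences agree / disagree.
    correct = sum(1 for x, y in zip(xs, ys) if x == y)
    return correct, len(ys) - correct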
Example #2
def run_baseline(article_count, split=0.9):
    articles, total_token_count = preprocess_wsj(article_count, [])
    train, test = bifurcate(articles, split, shuffle=True)

    counts = dict(DEF=0, INDEF=0)
    for article in train:
        for sentence in article:
            for def_tag in sentence.def_tags:
                # only use def / indef tokens
                if def_tag in ('DEF', 'INDEF'):
                    counts[def_tag] += 1

    mle = 'DEF' if counts['DEF'] > counts['INDEF'] else 'INDEF'

    test_det_def_tags = []
    for article in test:
        for sentence in article:
            for def_tag in sentence.def_tags:
                if def_tag in ('DEF', 'INDEF'):
                    test_det_def_tags.append(def_tag)

    correct, wrong = matches([mle] * len(test_det_def_tags), test_det_def_tags)

    return dict(
        total_articles_count=len(articles),  # int
        total_token_count=total_token_count,  # int
        train_count=len(train),  # int
        test_count=len(test),  # int
        kernel='MLE=%s' % mle,
        correct=correct,
        wrong=wrong,
        total=correct + wrong,
    )
Example #3
def run_svm(article_count, feature_functions, kernel='polynomial', split=0.9, model_path='svm.model'):
    # https://bitbucket.org/wcauchois/pysvmlight
    articles, total_token_count = preprocess_wsj(article_count, feature_functions)

    dictionary = Dictionary()
    dictionary.add_one('ZZZZZ')  # so that no features are labeled 0
    data = []
    for article in articles:
        for sentence in article:
            for tag, token_features in zip(sentence.def_tags, sentence.data):
                # only use def / indef tokens
                if tag in ('DEF', 'INDEF'):
                    features = dictionary.add(token_features)
                    features = sorted(list(set(features)))
                    feature_values = zip(features, [1]*len(features))
                    data.append((+1 if tag == 'DEF' else -1, feature_values))

    train, test = bifurcate(data, split, shuffle=True)

    # for corpus, name in [(train, 'train'), (test, 'test')]:
        # write_svm(corpus, 'wsj_svm-%s.data' % name)

    #####################
    # do svm in Python...
    model = svmlight.learn(train, type='classification', kernel=kernel)

    # svmlight.learn options
    # type: select between 'classification', 'regression', 'ranking' (preference ranking), and 'optimization'.
    # kernel: select between 'linear', 'polynomial', 'rbf', and 'sigmoid'.
    # verbosity: set the verbosity level (default 0).
    # C: trade-off between training error and margin.
    # poly_degree: parameter d in polynomial kernel.
    # rbf_gamma: parameter gamma in rbf kernel.
    # coef_lin
    # coef_const
    # costratio (corresponds to -j option to svm_learn)
    svmlight.write_model(model, model_path)

    gold_labels, test_feature_values = zip(*test)
    # total = len(gold_labels)

    test_pairs = [(0, feature_values) for feature_values in test_feature_values]
    predictions = svmlight.classify(model, test_pairs)

    correct, wrong = matches(
        [(gold > 0) for gold in gold_labels],
        [(prediction > 0) for prediction in predictions])

    return dict(
        total_articles_count=len(articles),  # int
        total_token_count=total_token_count,  # int
        train_count=len(train),  # int
        test_count=len(test),  # int
        kernel=kernel,
        correct=correct,
        wrong=wrong,
        total=correct + wrong,
    )
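For reference, the pysvmlight binding linked above takes each example as a (label, [(feature_id, value), ...]) tuple, which is exactly what the loop over articles builds. A tiny self-contained toy run in that format, independent of the WSJ preprocessing:

import svmlight

# Toy sparse examples: label +1/-1, then (feature_id, value) pairs; feature ids start at 1.
toy_train = [(+1, [(1, 1.0), (2, 1.0)]),
             (-1, [(1, 1.0), (3, 1.0)])]
toy_test = [(0, [(2, 1.0)]),  # the label slot is a placeholder; classify() only looks at the features
            (0, [(3, 1.0)])]

toy_model = svmlight.learn(toy_train, type='classification', kernel='linear')
toy_scores = svmlight.classify(toy_model, toy_test)  # positive score => the +1 class (DEF above)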
Example #4
def run_crf(article_count,
            feature_functions,
            split=0.9,
            model_path='crf.model'):
    articles, total_token_count = preprocess_wsj(article_count,
                                                 feature_functions)

    train, test = bifurcate(articles, split, shuffle=True)

    trainer = crf.Trainer()
    for article in train:
        for sentence in article:
            trainer.append_raw(sentence.data, sentence.def_tags)

    trainer.save(model_path)
    tagger = crf.Tagger(model_path)

    # results = defaultdict(list)
    # confusion_matrix = defaultdict(int)
    correct = 0
    wrong = 0
    for article in test:
        for sentence in article:
            gold_labels = sentence.def_tags
            predicted_labels = tagger.tag_raw(sentence.data)
            # for token, gold, predicted in zip(sentence.tokens, sentence.def_tags, predicted_tags):
            for gold, predicted in zip(gold_labels, predicted_labels):
                # key = (gold, predicted)
                # results[key] += [token]
                if gold in ('DEF', 'INDEF'):
                    if gold == predicted:
                        correct += 1
                    else:
                        wrong += 1

    # print 'Results'
    # for (predicted_label, gold_label), tokens in results.items():
    # color = 'green' if predicted_label == gold_label else 'red'
    # cprint('%5d predicted=%s -> gold=%s' % (len(tokens), predicted_label, gold_label), color)
    # print '  ', Counter(tokens).most_common(20)

    return dict(
        total_articles_count=len(articles),  # int
        total_token_count=total_token_count,  # int
        train_count=len(train),  # int
        test_count=len(test),  # int
        kernel='CRF',
        correct=correct,
        wrong=wrong,
        total=correct + wrong)
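A hedged usage sketch for the CRF runner: capitalized_feature below is a hypothetical feature function, since the exact contract preprocess_wsj expects from feature_functions is not shown here.

def capitalized_feature(token):
    # Hypothetical feature function: flag capitalized tokens.
    return ['capitalized'] if token[:1].isupper() else []

crf_result = run_crf(article_count=200,
                     feature_functions=[capitalized_feature],
                     split=0.9,
                     model_path='crf.model')
print('CRF: %d correct, %d wrong' % (crf_result['correct'], crf_result['wrong']))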
Example #5
def run_crf(article_count, feature_functions, split=0.9, model_path='crf.model'):
    articles, total_token_count = preprocess_wsj(article_count, feature_functions)

    train, test = bifurcate(articles, split, shuffle=True)

    trainer = crf.Trainer()
    for article in train:
        for sentence in article:
            trainer.append_raw(sentence.data, sentence.def_tags)

    trainer.save(model_path)
    tagger = crf.Tagger(model_path)

    # results = defaultdict(list)
    # confusion_matrix = defaultdict(int)
    correct = 0
    wrong = 0
    for article in test:
        for sentence in article:
            gold_labels = sentence.def_tags
            predicted_labels = tagger.tag_raw(sentence.data)
            # for token, gold, predicted in zip(sentence.tokens, sentence.def_tags, predicted_tags):
            for gold, predicted in zip(gold_labels, predicted_labels):
                # key = (gold, predicted)
                # results[key] += [token]
                if gold in ('DEF', 'INDEF'):
                    if gold == predicted:
                        correct += 1
                    else:
                        wrong += 1

    # print 'Results'
    # for (predicted_label, gold_label), tokens in results.items():
        # color = 'green' if predicted_label == gold_label else 'red'
        # cprint('%5d predicted=%s -> gold=%s' % (len(tokens), predicted_label, gold_label), color)
        # print '  ', Counter(tokens).most_common(20)

    return dict(
        total_articles_count=len(articles),  # int
        total_token_count=total_token_count,  # int
        train_count=len(train),  # int
        test_count=len(test),  # int
        kernel='CRF',
        correct=correct,
        wrong=wrong,
        total=correct + wrong
    )
Example #6
def do_hmm(documents, split):
    train, test = bifurcate(documents, split)

    # train does NOT accept generators
    tagger = HiddenMarkovModelTagger.train([doc.token_label_pairs() for doc in train])
    results = defaultdict(list)
    for doc in test:
        predicted = tagger.tag(doc.tokens)
        gold_labels = doc.labels
        # precision =
        # recall =

        token_tag_pairs = nltk.pos_tag(doc.literal)
        print('\n-----------\n' + gloss(token_tag_pairs))

        for (token, predicted_label), gold_label in zip(predicted, gold_labels):
            results[(predicted_label, gold_label)] += [token]

    # return the (predicted, gold) -> tokens mapping so callers can score it
    return results
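do_hmm assumes nltk, collections.defaultdict, and the project's gloss helper are available at module level. Unlike the other runners it hands back the raw (predicted, gold) -> tokens mapping rather than correct/wrong counts; a sketch of how a caller might reduce it to the same tallies (an assumption about usage, not part of the original code):

def summarize(results):
    # results: {(predicted_label, gold_label): [token, ...], ...} as built by do_hmm
    correct = sum(len(tokens) for (predicted, gold), tokens in results.items() if predicted == gold)
    wrong = sum(len(tokens) for (predicted, gold), tokens in results.items() if predicted != gold)
    return correct, wrong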
Example #7
def do_hmm(documents, split):
    train, test = bifurcate(documents, split)

    # train does NOT accept generators
    tagger = HiddenMarkovModelTagger.train(
        [doc.token_label_pairs() for doc in train])
    results = defaultdict(list)
    for doc in test:
        predicted = tagger.tag(doc.tokens)
        gold_labels = doc.labels
        # precision =
        # recall =

        token_tag_pairs = nltk.pos_tag(doc.literal)
        print('\n-----------\n' + gloss(token_tag_pairs))

        for (token,
             predicted_label), gold_label in zip(predicted, gold_labels):
            results[(predicted_label, gold_label)] += [token]

    # return the (predicted, gold) -> tokens mapping so callers can score it
    return results
Example #8
def run_svm(article_count,
            feature_functions,
            kernel='polynomial',
            split=0.9,
            model_path='svm.model'):
    # https://bitbucket.org/wcauchois/pysvmlight
    articles, total_token_count = preprocess_wsj(article_count,
                                                 feature_functions)

    dictionary = Dictionary()
    dictionary.add_one('ZZZZZ')  # so that no features are labeled 0
    data = []
    for article in articles:
        for sentence in article:
            for tag, token_features in zip(sentence.def_tags, sentence.data):
                # only use def / indef tokens
                if tag in ('DEF', 'INDEF'):
                    features = dictionary.add(token_features)
                    features = sorted(list(set(features)))
                    feature_values = zip(features, [1] * len(features))
                    data.append((+1 if tag == 'DEF' else -1, feature_values))

    train, test = bifurcate(data, split, shuffle=True)

    # for corpus, name in [(train, 'train'), (test, 'test')]:
    # write_svm(corpus, 'wsj_svm-%s.data' % name)

    #####################
    # do svm in Python...
    model = svmlight.learn(train, type='classification', kernel=kernel)

    # svmlight.learn options
    # type: select between 'classification', 'regression', 'ranking' (preference ranking), and 'optimization'.
    # kernel: select between 'linear', 'polynomial', 'rbf', and 'sigmoid'.
    # verbosity: set the verbosity level (default 0).
    # C: trade-off between training error and margin.
    # poly_degree: parameter d in polynomial kernel.
    # rbf_gamma: parameter gamma in rbf kernel.
    # coef_lin
    # coef_const
    # costratio (corresponds to -j option to svm_learn)
    svmlight.write_model(model, model_path)

    gold_labels, test_feature_values = zip(*test)
    # total = len(gold_labels)

    test_pairs = [(0, feature_values)
                  for feature_values in test_feature_values]
    predictions = svmlight.classify(model, test_pairs)

    correct, wrong = matches([(gold > 0) for gold in gold_labels],
                             [(prediction > 0) for prediction in predictions])

    return dict(
        total_articles_count=len(articles),  # int
        total_token_count=total_token_count,  # int
        train_count=len(train),  # int
        test_count=len(test),  # int
        kernel=kernel,
        correct=correct,
        wrong=wrong,
        total=correct + wrong,
    )
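All of the runners above return the same result dict, so they can be compared with a small driver. A sketch only: the article count is arbitrary, and the empty feature-function lists are purely for illustration (real runs would pass the project's feature functions).

def accuracy(result):
    # Fraction of DEF/INDEF test tokens labeled correctly.
    return result['correct'] / float(result['total'])

for name, result in [('baseline', run_baseline(100)),
                     ('crf', run_crf(100, [])),
                     ('svm', run_svm(100, [], kernel='linear'))]:
    print('%-8s kernel=%-12s accuracy=%.3f' % (name, result['kernel'], accuracy(result)))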