def run_baseline(article_count, split=0.9):
    """Evaluate a most-frequent-tag baseline on DEF / INDEF tags.

    Articles are obtained from ``preprocess_wsj(article_count, [])`` and
    divided with ``bifurcate(articles, split, shuffle=True)``. Whichever of
    'DEF' / 'INDEF' occurs more often in the training part is predicted for
    every DEF / INDEF tag in the test part; a tie resolves to 'INDEF'.

    Returns a dict with the keys ``total_articles_count``,
    ``total_token_count``, ``train_count``, ``test_count``, ``kernel``
    (the string 'MLE=<tag>'), ``correct``, ``wrong`` and ``total``.
    """
    articles, total_token_count = preprocess_wsj(article_count, [])
    train, test = bifurcate(articles, split, shuffle=True)

    def determiner_tags(corpus):
        # Walk article -> sentence -> tag, keeping only def / indef tags.
        for article in corpus:
            for sentence in article:
                for tag in sentence.def_tags:
                    if tag in ('DEF', 'INDEF'):
                        yield tag

    tally = {'DEF': 0, 'INDEF': 0}
    for tag in determiner_tags(train):
        tally[tag] += 1

    # 'DEF' must be strictly more frequent to win; otherwise 'INDEF'.
    if tally['DEF'] > tally['INDEF']:
        mle = 'DEF'
    else:
        mle = 'INDEF'

    gold_tags = list(determiner_tags(test))
    guesses = [mle] * len(gold_tags)
    correct, wrong = matches(guesses, gold_tags)

    return {
        'total_articles_count': len(articles),  # int
        'total_token_count': total_token_count,  # int
        'train_count': len(train),  # int
        'test_count': len(test),  # int
        'kernel': 'MLE=%s' % mle,
        'correct': correct,
        'wrong': wrong,
        'total': correct + wrong,
    }
def run_baseline(article_count, split=0.9):
    """Evaluate a most-frequent-tag baseline on DEF / INDEF tags.

    Articles are obtained from ``preprocess_wsj(article_count, [])`` and
    divided with ``bifurcate(articles, split, shuffle=True)``. Whichever of
    'DEF' / 'INDEF' occurs more often in the training part is predicted for
    every DEF / INDEF tag in the test part; a tie resolves to 'INDEF'.

    Returns a dict with the keys ``total_articles_count``,
    ``total_token_count``, ``train_count``, ``test_count``, ``kernel``
    (the string 'MLE=<tag>'), ``correct``, ``wrong`` and ``total``.
    """
    articles, total_token_count = preprocess_wsj(article_count, [])
    train, test = bifurcate(articles, split, shuffle=True)

    def determiner_tags(corpus):
        # Walk article -> sentence -> tag, keeping only def / indef tags.
        for article in corpus:
            for sentence in article:
                for tag in sentence.def_tags:
                    if tag in ('DEF', 'INDEF'):
                        yield tag

    tally = {'DEF': 0, 'INDEF': 0}
    for tag in determiner_tags(train):
        tally[tag] += 1

    # 'DEF' must be strictly more frequent to win; otherwise 'INDEF'.
    if tally['DEF'] > tally['INDEF']:
        mle = 'DEF'
    else:
        mle = 'INDEF'

    gold_tags = list(determiner_tags(test))
    guesses = [mle] * len(gold_tags)
    correct, wrong = matches(guesses, gold_tags)

    return {
        'total_articles_count': len(articles),  # int
        'total_token_count': total_token_count,  # int
        'train_count': len(train),  # int
        'test_count': len(test),  # int
        'kernel': 'MLE=%s' % mle,
        'correct': correct,
        'wrong': wrong,
        'total': correct + wrong,
    }
def run_svm(article_count, feature_functions, kernel='polynomial', split=0.9, model_path='svm.model'):
    """Train and evaluate an SVM-light classifier on DEF vs. INDEF tokens.

    Every token tagged 'DEF' or 'INDEF' becomes one instance: label +1 for
    'DEF' and -1 for 'INDEF', with each distinct feature id given value 1.
    The instances (not the articles) are divided with
    ``bifurcate(data, split, shuffle=True)``, so ``train_count`` and
    ``test_count`` in the result count instances. The learned model is
    written to ``model_path``.

    Returns a dict with the keys ``total_articles_count``,
    ``total_token_count``, ``train_count``, ``test_count``, ``kernel``,
    ``correct``, ``wrong`` and ``total``.
    """
    # Python bindings used here: https://bitbucket.org/wcauchois/pysvmlight
    articles, total_token_count = preprocess_wsj(article_count, feature_functions)

    dictionary = Dictionary()
    # Register a placeholder first so that no features are labeled 0.
    dictionary.add_one('ZZZZZ')

    data = []
    for article in articles:
        for sentence in article:
            for tag, token_features in zip(sentence.def_tags, sentence.data):
                # only use def / indef tokens
                if tag not in ('DEF', 'INDEF'):
                    continue
                feature_ids = sorted(set(dictionary.add(token_features)))
                label = +1 if tag == 'DEF' else -1
                feature_values = [(feature_id, 1) for feature_id in feature_ids]
                data.append((label, feature_values))

    train, test = bifurcate(data, split, shuffle=True)

    # svmlight.learn options:
    #   type: 'classification', 'regression', 'ranking' (preference ranking),
    #         or 'optimization'.
    #   kernel: 'linear', 'polynomial', 'rbf', or 'sigmoid'.
    #   verbosity: verbosity level (default 0).
    #   C: trade-off between training error and margin.
    #   poly_degree: parameter d in polynomial kernel.
    #   rbf_gamma: parameter gamma in rbf kernel.
    #   coef_lin, coef_const
    #   costratio: corresponds to the -j option of svm_learn.
    model = svmlight.learn(train, type='classification', kernel=kernel)
    svmlight.write_model(model, model_path)

    gold_labels, test_feature_values = zip(*test)
    # The label slot is filled with 0 for the instances handed to classify.
    test_pairs = [(0, feature_values) for feature_values in test_feature_values]
    predictions = svmlight.classify(model, test_pairs)

    # Compare on the sign of the gold label and of the prediction.
    gold_is_def = [(gold > 0) for gold in gold_labels]
    predicted_is_def = [(prediction > 0) for prediction in predictions]
    correct, wrong = matches(gold_is_def, predicted_is_def)

    return {
        'total_articles_count': len(articles),  # int
        'total_token_count': total_token_count,  # int
        'train_count': len(train),  # int
        'test_count': len(test),  # int
        'kernel': kernel,
        'correct': correct,
        'wrong': wrong,
        'total': correct + wrong,
    }
def run_crf(article_count, feature_functions, split=0.9, model_path='crf.model'):
    """Train a CRF on the training articles and score it on DEF / INDEF tags.

    Articles from ``preprocess_wsj`` are divided with
    ``bifurcate(articles, split, shuffle=True)``. Every training sentence is
    appended to a ``crf.Trainer``, which is saved to ``model_path``; a
    ``crf.Tagger`` loaded from that path then tags each test sentence. Only
    positions whose gold tag is 'DEF' or 'INDEF' are scored.

    Returns a dict with the keys ``total_articles_count``,
    ``total_token_count``, ``train_count``, ``test_count``, ``kernel``
    (always 'CRF'), ``correct``, ``wrong`` and ``total``.
    """
    articles, total_token_count = preprocess_wsj(article_count, feature_functions)
    train, test = bifurcate(articles, split, shuffle=True)

    def sentences_of(corpus):
        # Flatten articles into a single stream of sentences.
        for article in corpus:
            for sentence in article:
                yield sentence

    trainer = crf.Trainer()
    for sentence in sentences_of(train):
        trainer.append_raw(sentence.data, sentence.def_tags)
    trainer.save(model_path)

    tagger = crf.Tagger(model_path)

    correct = 0
    wrong = 0
    for sentence in sentences_of(test):
        labelled = zip(sentence.def_tags, tagger.tag_raw(sentence.data))
        for gold, predicted in labelled:
            # Positions whose gold tag is not def / indef are not scored.
            if gold not in ('DEF', 'INDEF'):
                continue
            if gold == predicted:
                correct += 1
            else:
                wrong += 1

    return {
        'total_articles_count': len(articles),  # int
        'total_token_count': total_token_count,  # int
        'train_count': len(train),  # int
        'test_count': len(test),  # int
        'kernel': 'CRF',
        'correct': correct,
        'wrong': wrong,
        'total': correct + wrong,
    }
def run_crf(article_count, feature_functions, split=0.9, model_path='crf.model'):
    """Train a CRF on the training articles and score it on DEF / INDEF tags.

    Articles from ``preprocess_wsj`` are divided with
    ``bifurcate(articles, split, shuffle=True)``. Every training sentence is
    appended to a ``crf.Trainer``, which is saved to ``model_path``; a
    ``crf.Tagger`` loaded from that path then tags each test sentence. Only
    positions whose gold tag is 'DEF' or 'INDEF' are scored.

    Returns a dict with the keys ``total_articles_count``,
    ``total_token_count``, ``train_count``, ``test_count``, ``kernel``
    (always 'CRF'), ``correct``, ``wrong`` and ``total``.
    """
    articles, total_token_count = preprocess_wsj(article_count, feature_functions)
    train, test = bifurcate(articles, split, shuffle=True)

    def sentences_of(corpus):
        # Flatten articles into a single stream of sentences.
        for article in corpus:
            for sentence in article:
                yield sentence

    trainer = crf.Trainer()
    for sentence in sentences_of(train):
        trainer.append_raw(sentence.data, sentence.def_tags)
    trainer.save(model_path)

    tagger = crf.Tagger(model_path)

    correct = 0
    wrong = 0
    for sentence in sentences_of(test):
        labelled = zip(sentence.def_tags, tagger.tag_raw(sentence.data))
        for gold, predicted in labelled:
            # Positions whose gold tag is not def / indef are not scored.
            if gold not in ('DEF', 'INDEF'):
                continue
            if gold == predicted:
                correct += 1
            else:
                wrong += 1

    return {
        'total_articles_count': len(articles),  # int
        'total_token_count': total_token_count,  # int
        'train_count': len(train),  # int
        'test_count': len(test),  # int
        'kernel': 'CRF',
        'correct': correct,
        'wrong': wrong,
        'total': correct + wrong,
    }
def do_hmm(documents, split):
    """Train an HMM tagger on part of ``documents`` and tag the remainder.

    ``bifurcate(documents, split)`` provides the train / test parts. For each
    test document a gloss of its POS-tagged ``literal`` is printed, and the
    tokens are grouped in a local table keyed by
    ``(predicted_label, gold_label)``. That table is built but not returned;
    the function has no explicit return value.
    """
    train, test = bifurcate(documents, split)

    # train does NOT accept generators, so materialize the sequences first.
    training_sequences = [doc.token_label_pairs() for doc in train]
    tagger = HiddenMarkovModelTagger.train(training_sequences)

    results = defaultdict(list)
    for doc in test:
        predicted = tagger.tag(doc.tokens)
        gold_labels = doc.labels
        token_tag_pairs = nltk.pos_tag(doc.literal)
        # Single-argument parenthesized form: same output as the print statement.
        print('\n-----------\n' + gloss(token_tag_pairs))
        for (token, predicted_label), gold_label in zip(predicted, gold_labels):
            results[(predicted_label, gold_label)].append(token)
def do_hmm(documents, split):
    """Train an HMM tagger on part of ``documents`` and tag the remainder.

    ``bifurcate(documents, split)`` provides the train / test parts. For each
    test document a gloss of its POS-tagged ``literal`` is printed, and the
    tokens are grouped in a local table keyed by
    ``(predicted_label, gold_label)``. That table is built but not returned;
    the function has no explicit return value.
    """
    train, test = bifurcate(documents, split)

    # train does NOT accept generators, so materialize the sequences first.
    training_sequences = [doc.token_label_pairs() for doc in train]
    tagger = HiddenMarkovModelTagger.train(training_sequences)

    results = defaultdict(list)
    for doc in test:
        predicted = tagger.tag(doc.tokens)
        gold_labels = doc.labels
        token_tag_pairs = nltk.pos_tag(doc.literal)
        # Single-argument parenthesized form: same output as the print statement.
        print('\n-----------\n' + gloss(token_tag_pairs))
        for (token, predicted_label), gold_label in zip(predicted, gold_labels):
            results[(predicted_label, gold_label)].append(token)
def run_svm(article_count, feature_functions, kernel='polynomial', split=0.9, model_path='svm.model'):
    """Train and evaluate an SVM-light classifier on DEF vs. INDEF tokens.

    Every token tagged 'DEF' or 'INDEF' becomes one instance: label +1 for
    'DEF' and -1 for 'INDEF', with each distinct feature id given value 1.
    The instances (not the articles) are divided with
    ``bifurcate(data, split, shuffle=True)``, so ``train_count`` and
    ``test_count`` in the result count instances. The learned model is
    written to ``model_path``.

    Returns a dict with the keys ``total_articles_count``,
    ``total_token_count``, ``train_count``, ``test_count``, ``kernel``,
    ``correct``, ``wrong`` and ``total``.
    """
    # Python bindings used here: https://bitbucket.org/wcauchois/pysvmlight
    articles, total_token_count = preprocess_wsj(article_count, feature_functions)

    dictionary = Dictionary()
    # Register a placeholder first so that no features are labeled 0.
    dictionary.add_one('ZZZZZ')

    data = []
    for article in articles:
        for sentence in article:
            for tag, token_features in zip(sentence.def_tags, sentence.data):
                # only use def / indef tokens
                if tag not in ('DEF', 'INDEF'):
                    continue
                feature_ids = sorted(set(dictionary.add(token_features)))
                label = +1 if tag == 'DEF' else -1
                feature_values = [(feature_id, 1) for feature_id in feature_ids]
                data.append((label, feature_values))

    train, test = bifurcate(data, split, shuffle=True)

    # svmlight.learn options:
    #   type: 'classification', 'regression', 'ranking' (preference ranking),
    #         or 'optimization'.
    #   kernel: 'linear', 'polynomial', 'rbf', or 'sigmoid'.
    #   verbosity: verbosity level (default 0).
    #   C: trade-off between training error and margin.
    #   poly_degree: parameter d in polynomial kernel.
    #   rbf_gamma: parameter gamma in rbf kernel.
    #   coef_lin, coef_const
    #   costratio: corresponds to the -j option of svm_learn.
    model = svmlight.learn(train, type='classification', kernel=kernel)
    svmlight.write_model(model, model_path)

    gold_labels, test_feature_values = zip(*test)
    # The label slot is filled with 0 for the instances handed to classify.
    test_pairs = [(0, feature_values) for feature_values in test_feature_values]
    predictions = svmlight.classify(model, test_pairs)

    # Compare on the sign of the gold label and of the prediction.
    gold_is_def = [(gold > 0) for gold in gold_labels]
    predicted_is_def = [(prediction > 0) for prediction in predictions]
    correct, wrong = matches(gold_is_def, predicted_is_def)

    return {
        'total_articles_count': len(articles),  # int
        'total_token_count': total_token_count,  # int
        'train_count': len(train),  # int
        'test_count': len(test),  # int
        'kernel': kernel,
        'correct': correct,
        'wrong': wrong,
        'total': correct + wrong,
    }