def train_and_trial(train_file, test_file, train_parse='', test_parse='', pickled=True, use_dep=False):
    """Trains a ConsecutiveChunkTagger on train_file and returns its per-sentence
    parse results on test_file.
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(train_file, 'rb')
        traind = cPickle.load(f)
        f.close()
        f = open(test_file, 'rb')
        testd = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(train_file)
        testd = XMLParser.create_exs(test_file)
    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    print "should really use better dictionary for sentence senti labels"
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    train_sentiment = [senti_classify(sent, posi_words, negi_words) for sent in traind['orig']]
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], train_parse, dictionary=True, iobs=True)
    chunker = ConsecutiveChunkTagger(zip(traind['iob'], traind['polarity']), senti_dictionary, train_sentiment, dep_parses)
    print "done training"
    test_sentiment = [senti_classify(sent, posi_words, negi_words) for sent in testd['orig']]
    dep_parses = [[]] * len(test_sentiment)
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(testd['iob'], test_parse, dictionary=True, iobs=True)
    results = []
    for i in range(len(test_sentiment)):
        results.append(chunker.parse((testd['iob'][i], test_sentiment[i], dep_parses[i])))
    return results
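#Example usage (sketch): the pickle paths are placeholders for locally prepared SemEval
#data; pass use_dep=True plus parse files to add dependency-parse features.
#  polarity_guesses = train_and_trial('Rest_train_v2.pkl', 'Rest_test.pkl', pickled=True)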
def train_and_test(filename, parse_file, use_deps=False, posit_lex_file='positive-words.txt',
                   nega_lex_file='negative-words.txt'):
    """Creates an 80/20 split of the examples in filename, trains the chunker
    on 80%, and evaluates the learned chunker on 20%.
    """
    global use_dep_parse
    if use_deps:
        use_dep_parse = True
    traind = XMLParser.create_exs(filename)
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = traind['iob'][:split_size]
    test = traind['iob'][split_size:]
    test_deps = []
    if use_dep_parse:
        test_deps = dep_parses[split_size:]
    #Liu not in use for now
    #posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    #negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(train, test, senti_dictionary, dep_parses)
    guessed_iobs = chunker.evaluate([test, test_deps])
    semeval_util.compute_pr(test, guessed_iobs)
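#Example usage (sketch; the file names are placeholders for the SemEval XML and its
#pre-computed dependency parses):
#  train_and_test('Restaurants_Train.xml', 'restaurants_train.parse', use_deps=True)
#This trains on the first 80% of the examples and prints precision/recall/F1 via
#semeval_util.compute_pr on the held-out 20%.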
def K_fold_train_and_test(filename, posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt',
                          k=2, pickled=False):
    """Does K-fold cross-validation on the given filename
    """
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    #posi_words = get_liu_lexicon(posit_lex_file)
    #negi_words = get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_p, tot_r, tot_f1 = 0, 0, 0
    for train, test in kf:
        print "next fold, split size: %d/%d" % (len(train), len(test))
        #print train
        train_set = []
        test_set = []
        for i in train:
            train_set.append(traind['iob'][i])
        for i in test:
            test_set.append(traind['iob'][i])
        chunker = ConsecutiveChunker(train_set, senti_dictionary)
        guesses = chunker.evaluate(test_set)
        print test_set
        print guesses
        r, p, f = semeval_util.compute_pr(test_set, guesses)
        tot_p += p
        tot_r += r
        tot_f1 += f
    print "ave Prec: %.2f, Rec: %.2f, F1: %.2f" % (tot_p / float(k), tot_r / float(k), tot_f1 / float(k))
def K_fold_train_and_test(filename, parse_file, use_dep=False, posit_lex_file='positive-words.txt',
                          nega_lex_file='negative-words.txt', k=5, pickled=False):
    """Does K-fold cross-validation on the given filename
    """
    global use_dep_parse
    if use_dep:
        print "using dependency parses"
        use_dep_parse = True
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    dep_parses = traind['iob']
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)
    #posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    #negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_p, tot_r, tot_f1 = 0, 0, 0
    for train, test in kf:
        print "next fold, split size: %d/%d" % (len(train), len(test))
        #print train
        train_set = []
        test_set = []
        train_parse = []
        test_parse = []
        for i in train:
            train_set.append(traind['iob'][i])
            train_parse.append(dep_parses[i])
        for i in test:
            test_set.append(traind['iob'][i])
            test_parse.append(dep_parses[i])
        chunker = ConsecutiveChunker(train_set, test_set, senti_dictionary, train_parse)
        guesses = chunker.evaluate([test_set, test_parse])
        #print test_set
        #print guesses
        r, p, f = semeval_util.compute_pr(test_set, guesses)
        tot_p += p
        tot_r += r
        tot_f1 += f
    print "ave Prec: %.2f, Rec: %.2f, F1: %.2f" % (tot_p / float(k), tot_r / float(k), tot_f1 / float(k))
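#Example usage (sketch; paths are placeholders): 5-fold cross-validation of the chunker
#with dependency-parse features, reporting averaged precision/recall/F1.
#  K_fold_train_and_test('Restaurants_Train.xml', 'restaurants_train.parse', use_dep=True, k=5)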
def just_train(train_file):
    f = open(train_file, 'rb')
    traind = cPickle.load(f)
    f.close()
    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    print "should really use better dictionary for sentence senti labels"
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    train_sentiment = [senti_classify(sent, posi_words, negi_words) for sent in traind['orig']]
    #return the trained tagger so callers can use it
    return ConsecutiveChunkTagger(zip(traind['iob'], traind['polarity']), senti_dictionary, train_sentiment, [])
def train_and_trial(trn_file, test_file, parse_file_train, parse_file_test, use_dep=False,
                    posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt', pickled=False):
    """ Train on the training file and test on the testing file
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(trn_file, 'rb')
        traind = cPickle.load(f)
        f.close()
        f = open(test_file, 'rb')
        testd = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(trn_file)
        testd = XMLParser.create_exs(test_file)
    #posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    #negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file_train, dictionary=True, iobs=True)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(traind['iob'], testd['iob'], senti_dictionary, dep_parses)
    print "done training on %d examples" % len(traind['iob'])
    '''
    f = open('learned.pkl', 'wb')
    cPickle.dump(chunker, f)
    f.close()
    '''
    if use_dep_parse:
        #parse features for the test sentences
        dep_parses = semeval_util.add_dep_parse_features(testd['iob'], parse_file_test, dictionary=True, iobs=True)
    guessed_iobs = chunker.evaluate([testd['iob'], dep_parses])
    ###semeval_util.compute_pr(testd['iob'], guessed_iobs)
    return guessed_iobs
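def write_trial_guesses(trn_file, test_file, parse_file_train, parse_file_test, out_file='trial_guesses.pkl'):
    """Hypothetical convenience wrapper (a sketch, not part of the original pipeline):
    runs train_and_trial above and pickles the guessed IOB labels for later scoring.
    The out_file name is a placeholder.
    """
    guesses = train_and_trial(trn_file, test_file, parse_file_train, parse_file_test, use_dep=True)
    f = open(out_file, 'wb')
    cPickle.dump(guesses, f)
    f.close()
    return guesses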
def k_fold(filename, parse_filename, k=5, pickled=True, use_dep=False):
    """Does K-fold cross-validation of the polarity tagger on the given filename."""
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    full_senti_label = [senti_classify(sentence, posi_words, negi_words) for sentence in traind['orig']]
    dep_parses = [[]] * n
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_filename, dictionary=True, iobs=True)
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_acc = 0.
    for train, test in kf:
        print "next fold, split size: %d/%d" % (len(train), len(test))
        #print train
        train_set = []
        train_sentis = []
        train_parse = []
        test_set = []
        test_sentis = []
        test_parse = []
        for i in train:
            train_set.append((traind['iob'][i], traind['polarity'][i]))
            train_sentis.append(full_senti_label[i])
            train_parse.append(dep_parses[i])
        for i in test:
            test_set.append((traind['iob'][i], traind['polarity'][i]))
            test_sentis.append(full_senti_label[i])
            test_parse.append(dep_parses[i])
        chunker = ConsecutiveChunkTagger(train_set, senti_dictionary, train_sentis, train_parse)
        acc = chunker.evaluate(zip(test_set, test_sentis, test_parse))
        print "acc:", acc
        tot_acc += acc
    print "average acc:", tot_acc/k
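#Example usage (sketch): cross-validate the polarity tagger on a pickled training set.
#The pickle path mirrors the one in __main__ below; the parse file name is a placeholder.
#  k_fold('../PycharmProjects/emnlp/Rest_train_v2.pkl', 'rest_train.parse', k=5, pickled=True)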
def train_and_test(filename, parse_file, posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt',
                   pickled=False, use_dep=False):
    """Creates an 80/20 split of the examples in filename, trains the sentiment
    classifier on 80%, and evaluates the learned classifier on 20%.
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = zip(traind['iob'][:split_size], traind['polarity'][:split_size])
    test = zip(traind['iob'][split_size:], traind['polarity'][split_size:])
    posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    full_senti_label = [senti_classify(sentence, posi_words, negi_words) for sentence in traind['orig']]
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)
        print "first dep_parse:", dep_parses[0]
        print "first train ex:", train[0]
        print "size parses all:", len(dep_parses), "vs train:", len(dep_parses[:split_size])
    chunker = ConsecutiveChunkTagger(train, senti_dictionary, full_senti_label, dep_parses[:split_size])
    print "done training"
    if use_dep_parse:
        dep_parses = dep_parses[split_size:]
        print "first test dep parse:", dep_parses[0]
        print "first test ex:", test[0]
    else:
        #artifact of using zip, even if not using parses, need to have same # of elements in all lists
        dep_parses = [[]] * len(test)
    print chunker.evaluate(zip(test, full_senti_label[split_size:], dep_parses))
def train_and_test(filename, posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt', pickled=False):
    """Creates an 80/20 split of the examples in filename, trains the chunker
    on 80%, and evaluates the learned chunker on 20%.
    """
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = traind['iob'][:split_size]
    test = traind['iob'][split_size:]
    #posi_words = get_liu_lexicon(posit_lex_file)
    #negi_words = get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker()
    chunker.train(train, senti_dictionary)
    guessed_iobs = chunker.evaluate(test)
    semeval_util.compute_pr(test, guessed_iobs)
print "acc:", acc tot_acc += acc print "average acc:", tot_acc/k if __name__ == '__main__': #f = file('obj.save', 'wb') #cPickle.dump(my_obj, f, protocol=cPickle.HIGHEST_PROTOCOL) #f.close() f = open('../PycharmProjects/emnlp/Rest_train_v2.pkl', 'rb') traind = cPickle.load(f) f.close() posi_words = semeval_util.get_liu_lexicon('positive-words.txt') negi_words = semeval_util.get_liu_lexicon('negative-words.txt') senti_dictionary = semeval_util.get_mpqa_lexicon() full_senti_label = [senti_classify(sentence, posi_words, negi_words) for sentence in traind['orig']] #split_size = int(len(traind['orig']) * .25) split_size = len(traind['orig']) #subset = zip(traind['iob'][:split_size], traind['polarity'][:split_size]) tp, tneg, tneutr = 0., 0., 0. fnn = 0. missed_neut, fpn = 0., 0. wrong_empties = 0. for i in range(split_size): labels = traind['polarity'][i] if len(labels) > 0: senti = 0 for l in labels: if l=='positive': senti += 1