Пример #1
0
def train_and_trial(train_file, test_file, train_parse='', test_parse='', pickled=True, use_dep=False):
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(train_file, 'rb')
        traind = cPickle.load(f)
        f.close()
        f = open(test_file, 'rb')
        testd = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(train_file)
        testd = XMLParser.create_exs(test_file)
    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    print "should really use better dictionary for sentence senti labels"
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    train_sentiment = [senti_classify(sent, posi_words, negi_words) for sent in traind['orig']]

    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], train_parse, dictionary=True, iobs=True)
    chunker = ConsecutiveChunkTagger(zip(traind['iob'],traind['polarity']), senti_dictionary,
                                     train_sentiment, dep_parses)
    print "done training"
    test_sentiment = [senti_classify(sent, posi_words, negi_words) for sent in testd['orig']]
    dep_parses = [[]] * len(test_sentiment)
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(testd['iob'], test_parse, dictionary=True, iobs=True)
    results = []
    for i in range(len(test_sentiment)):
        results.append(chunker.parse((testd['iob'][i], test_sentiment[i], dep_parses[i])))
    return results
Пример #2
0
def train_and_test(filename, parse_file, use_deps=False,
                   posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt'):
    """Creates an 80/20 split of the examples in filename,
    trains the chunker on 80%, and evaluates the learned chunker on 20%.

    Args:
        filename: handed to XMLParser.create_exs to load the examples.
        parse_file: forwarded to semeval_util.add_dep_parse_features; only
            read when dependency parses are enabled.
        use_deps: when true, turns the module-level use_dep_parse flag on.
        posit_lex_file: currently unused (the Liu lexicon is not in use).
        nega_lex_file: currently unused (the Liu lexicon is not in use).

    The guessed IOB sequences are handed to semeval_util.compute_pr; nothing
    is returned.
    """
    # The flag is only ever turned on here: use_deps=False leaves the
    # module-level value untouched.
    global use_dep_parse
    if use_deps:
        use_dep_parse = True
    traind = XMLParser.create_exs(filename)
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = traind['iob'][:split_size]
    test = traind['iob'][split_size:]
    # Split the parses exactly like the sentences so training only ever sees
    # the parses of the training portion. Both slices are [] when parses are
    # disabled.
    train_deps = dep_parses[:split_size]
    test_deps = dep_parses[split_size:]
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(train, test, senti_dictionary, train_deps)
    guessed_iobs = chunker.evaluate([test, test_deps])
    semeval_util.compute_pr(test, guessed_iobs)
Пример #3
0
def train_and_test(filename,
                   parse_file,
                   use_deps=False,
                   posit_lex_file='positive-words.txt',
                   nega_lex_file='negative-words.txt'):
    """Creates an 80/20 split of the examples in filename,
    trains the chunker on 80%, and evaluates the learned chunker on 20%.

    Args:
        filename: handed to XMLParser.create_exs to load the examples.
        parse_file: forwarded to semeval_util.add_dep_parse_features; only
            read when dependency parses are enabled.
        use_deps: when true, turns the module-level use_dep_parse flag on.
        posit_lex_file: currently unused (the Liu lexicon is not in use).
        nega_lex_file: currently unused (the Liu lexicon is not in use).

    The guessed IOB sequences are handed to semeval_util.compute_pr; nothing
    is returned.
    """
    # The flag is only ever turned on here: use_deps=False leaves the
    # module-level value untouched.
    global use_dep_parse
    if use_deps:
        use_dep_parse = True
    traind = XMLParser.create_exs(filename)
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'],
                                                         parse_file,
                                                         dictionary=True,
                                                         iobs=True)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = traind['iob'][:split_size]
    test = traind['iob'][split_size:]
    # Split the parses exactly like the sentences so training only ever sees
    # the parses of the training portion. Both slices are [] when parses are
    # disabled.
    train_deps = dep_parses[:split_size]
    test_deps = dep_parses[split_size:]
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(train, test, senti_dictionary, train_deps)
    guessed_iobs = chunker.evaluate([test, test_deps])
    semeval_util.compute_pr(test, guessed_iobs)
Пример #4
0
def K_fold_train_and_test(filename, posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt', k=2, pickled=False):
    """Does K-fold cross-validation on the given filename
    """
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    #posi_words = get_liu_lexicon(posit_lex_file)
    #negi_words = get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_p, tot_r, tot_f1 = 0, 0, 0
    for train, test in kf:
        print "next fold, split size: %d/%d" %(len(train), len(test))
        #print train
        train_set = []
        test_set = []
        for i in train:
            train_set.append(traind['iob'][i])
        for i in test:
            test_set.append(traind['iob'][i])
        chunker = ConsecutiveChunker(train_set, senti_dictionary)
        guesses = chunker.evaluate(test_set)
        print test_set
        print guesses
        r, p, f = semeval_util.compute_pr(test_set, guesses)
        tot_p += p
        tot_r += r
        tot_f1 += f
    print "ave Prec: %.2f, Rec: %.2f, F1: %.2f" %(tot_p/float(k), tot_r/float(k), tot_f1/float(k))
Пример #5
0
def K_fold_train_and_test(filename,
                          parse_file,
                          use_dep=False,
                          posit_lex_file='positive-words.txt',
                          nega_lex_file='negative-words.txt',
                          k=5,
                          pickled=False):
    """Does K-fold cross-validation on the given filename
    """
    global use_dep_parse
    if use_dep:
        print "using dependency parses"
        use_dep_parse = True
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    dep_parses = traind['iob']
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'],
                                                         parse_file,
                                                         dictionary=True,
                                                         iobs=True)
    #posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    #negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_p, tot_r, tot_f1 = 0, 0, 0
    for train, test in kf:
        print "next fold, split size: %d/%d" % (len(train), len(test))
        #print train
        train_set = []
        test_set = []
        train_parse = []
        test_parse = []
        for i in train:
            train_set.append(traind['iob'][i])
            train_parse.append(dep_parses[i])
        for i in test:
            test_set.append(traind['iob'][i])
            test_parse.append(dep_parses[i])
        chunker = ConsecutiveChunker(train_set, test_set, senti_dictionary,
                                     train_parse)
        guesses = chunker.evaluate([test_set, test_parse])
        #print test_set
        #print guesses
        r, p, f = semeval_util.compute_pr(test_set, guesses)
        tot_p += p
        tot_r += r
        tot_f1 += f
    print "ave Prec: %.2f, Rec: %.2f, F1: %.2f" % (tot_p / float(k), tot_r /
                                                   float(k), tot_f1 / float(k))
Пример #6
0
def just_train(train_file):
    f = open(train_file, 'rb')
    traind = cPickle.load(f)
    f.close()
    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    print "should really use better dictionary for sentence senti labels"
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    train_sentiment = [senti_classify(sent, posi_words, negi_words) for sent in traind['orig']]
    ConsecutiveChunkTagger(zip(traind['iob'],traind['polarity']), senti_dictionary,
                                     train_sentiment, [])
Пример #7
0
def train_and_trial(trn_file,
                    test_file,
                    parse_file_train,
                    parse_file_test,
                    use_dep=False,
                    posit_lex_file='positive-words.txt',
                    nega_lex_file='negative-words.txt',
                    pickled=False):
    """ Train on the training file and test on the testing file
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(trn_file, 'rb')
        traind = cPickle.load(f)
        f.close()
        f = open(test_file, 'rb')
        testd = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(trn_file)
        testd = XMLParser.create_exs(test_file)
    #posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    #negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'],
                                                         parse_file_train,
                                                         dictionary=True,
                                                         iobs=True)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(traind['iob'], testd['iob'], senti_dictionary,
                                 dep_parses)
    print "done training on %d examples" % len(traind['iob'])
    '''
    f = open('learned.pkl','wb')
    cPickle.dump(chunker,f)
    f.close()
    '''
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'],
                                                         parse_file_test,
                                                         dictionary=True,
                                                         iobs=True)

    guessed_iobs = chunker.evaluate([testd['iob'], dep_parses])
    ###semeval_util.compute_pr(testd['iob'], guessed_iobs)
    return guessed_iobs
Пример #8
0
def k_fold(filename, parse_filename, k=5, pickled=True, use_dep=False):
    global use_dep_parse
    if use_dep:
        use_dep_parse = True

    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    senti_dictionary = semeval_util.get_mpqa_lexicon()

    full_senti_label = [senti_classify(sentence, posi_words, negi_words) for sentence in traind['orig']]
    dep_parses = [[]] * n
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_filename, dictionary=True, iobs=True)
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_acc = 0.
    for train, test in kf:
        print "next fold, split size: %d/%d" %(len(train), len(test))
        #print train
        train_set = []
        train_sentis = []
        train_parse = []

        test_set = []
        test_sentis = []
        test_parse = []
        for i in train:
            train_set.append((traind['iob'][i], traind['polarity'][i]))
            train_sentis.append(full_senti_label[i])
            train_parse.append(dep_parses[i])
        for i in test:
            test_set.append((traind['iob'][i], traind['polarity'][i]))
            test_sentis.append((full_senti_label[i]))
            test_parse.append(dep_parses[i])
        chunker = ConsecutiveChunkTagger(train_set, senti_dictionary, train_sentis, train_parse)
        acc = chunker.evaluate(zip(test_set, test_sentis, test_parse))
        print "acc:", acc
        tot_acc += acc
    print "average acc:", tot_acc/k
Пример #9
0
def train_and_test(filename, parse_file,
                   posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt',
                   pickled=False, use_dep=False):
    """Creates an 80/20 split of the examples in filename,
    trains the sentiment classifier on 80%, and evaluates the learned classifier on 20%.
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = zip(traind['iob'][:split_size], traind['polarity'][:split_size])
    test = zip(traind['iob'][split_size:], traind['polarity'][split_size:])
    posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()

    full_senti_label = [senti_classify(sentence, posi_words, negi_words) for sentence in traind['orig']]

    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)
        print "first dep_parse:", dep_parses[0]
        print "first train ex:", train[0]
        print "size parses all:", len(dep_parses), "vs train:", len(dep_parses[:split_size])

    chunker = ConsecutiveChunkTagger(train, senti_dictionary, full_senti_label, dep_parses[:split_size])
    print "done training"

    if use_dep_parse:
        dep_parses = dep_parses[split_size:]
        print "first test dep parse:", dep_parses[0]
        print "first test ex:", test[0]
    else:
        #artifact of using zip, even if not using parses, need to have same # of elements in all lists
        dep_parses = [[]] * split_size
    print chunker.evaluate(zip(test, full_senti_label[split_size:], dep_parses))
Пример #10
0
def train_and_test(filename, posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt', pickled=False):
    """Creates an 80/20 split of the examples in filename,
    trains the chunker on 80%, and evaluates the learned chunker on 20%.

    Args:
        filename: example data; unpickled when `pickled` is true, otherwise
            handed to XMLParser.create_exs.
        posit_lex_file: currently unused (the Liu lexicon lookup is disabled).
        nega_lex_file: currently unused (the Liu lexicon lookup is disabled).
        pickled: whether filename is a pickle.

    The guessed IOB sequences are handed to semeval_util.compute_pr; nothing
    is returned.
    """
    if pickled:
        # `with` closes the handle even if unpickling raises.
        # NOTE(security): unpickling can run arbitrary code -- only load
        # trusted files.
        with open(filename, 'rb') as f:
            traind = cPickle.load(f)
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = traind['iob'][:split_size]
    test = traind['iob'][split_size:]
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker()
    chunker.train(train, senti_dictionary)
    guessed_iobs = chunker.evaluate(test)
    semeval_util.compute_pr(test, guessed_iobs)
Пример #11
0
def train_and_test(filename,
                   posit_lex_file='positive-words.txt',
                   nega_lex_file='negative-words.txt',
                   pickled=False):
    """Creates an 80/20 split of the examples in filename,
    trains the chunker on 80%, and evaluates the learned chunker on 20%.

    Args:
        filename: example data; unpickled when `pickled` is true, otherwise
            handed to XMLParser.create_exs.
        posit_lex_file: currently unused (the Liu lexicon lookup is disabled).
        nega_lex_file: currently unused (the Liu lexicon lookup is disabled).
        pickled: whether filename is a pickle.

    The guessed IOB sequences are handed to semeval_util.compute_pr; nothing
    is returned.
    """
    if pickled:
        # `with` closes the handle even if unpickling raises.
        # NOTE(security): unpickling can run arbitrary code -- only load
        # trusted files.
        with open(filename, 'rb') as f:
            traind = cPickle.load(f)
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = traind['iob'][:split_size]
    test = traind['iob'][split_size:]
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker()
    chunker.train(train, senti_dictionary)
    guessed_iobs = chunker.evaluate(test)
    semeval_util.compute_pr(test, guessed_iobs)
Пример #12
0
def train_and_trial(trn_file, test_file, parse_file_train, parse_file_test, use_dep=False,
                    posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt', pickled=False):
    """ Train on the training file and test on the testing file
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(trn_file, 'rb')
        traind = cPickle.load(f)
        f.close()
        f = open(test_file, 'rb')
        testd = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(trn_file)
        testd = XMLParser.create_exs(test_file)
    #posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    #negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file_train, dictionary=True, iobs=True)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(traind['iob'], testd['iob'], senti_dictionary, dep_parses)
    print "done training on %d examples" % len(traind['iob'])
    '''
    f = open('learned.pkl','wb')
    cPickle.dump(chunker,f)
    f.close()
    '''
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file_test, dictionary=True, iobs=True)

    guessed_iobs = chunker.evaluate([testd['iob'], dep_parses])
    ###semeval_util.compute_pr(testd['iob'], guessed_iobs)
    return guessed_iobs
Пример #13
0
        print "acc:", acc
        tot_acc += acc
    print "average acc:", tot_acc/k


if __name__ == '__main__':
    #f = file('obj.save', 'wb')
    #cPickle.dump(my_obj, f, protocol=cPickle.HIGHEST_PROTOCOL)
    #f.close()
    f = open('../PycharmProjects/emnlp/Rest_train_v2.pkl', 'rb')
    traind = cPickle.load(f)
    f.close()

    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    full_senti_label = [senti_classify(sentence, posi_words, negi_words) for sentence in traind['orig']]
    #split_size = int(len(traind['orig']) * .25)
    split_size = len(traind['orig'])
    #subset = zip(traind['iob'][:split_size], traind['polarity'][:split_size])
    tp, tneg, tneutr = 0., 0., 0.
    fnn = 0.
    missed_neut, fpn = 0., 0.
    wrong_empties = 0.
    for i in range(split_size):
        labels = traind['polarity'][i]
        if len(labels) > 0:
            senti = 0
            for l in labels:
                if l=='positive':
                    senti += 1