Example #1
File: eval.py Project: famaf/PLN_2017
    sys.stdout.flush()


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the model
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # load the data
    files = r'3LB-CAST/.*\.tbf\.xml'
    PATH = "./../../ancora-3.0.1es"
    corpus = SimpleAncoraCorpusReader(PATH, files)
    sents = list(corpus.tagged_sents())

    # Tag
    hits = 0
    total = 0

    # hits on known words
    hits_known_word = 0
    total_known_word = 0

    # hits on unknown words
    hits_unknown_word = 0
    total_unknown_word = 0

    # for the confusion matrix
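    gold_tags, model_tags = [], []  # flat tag lists for the confusion matrix

    # A hedged sketch of the evaluation loop these counters feed, assuming the
    # model exposes tag() and unknown() as in the other examples on this page.
    for sent in sents:
        word_sent, gold_tag_sent = zip(*sent)
        model_tag_sent = model.tag(word_sent)
        gold_tags += list(gold_tag_sent)
        model_tags += list(model_tag_sent)
        for word, m, g in zip(word_sent, model_tag_sent, gold_tag_sent):
            hit = (m == g)
            hits += hit
            total += 1
            if model.unknown(word):
                hits_unknown_word += hit
                total_unknown_word += 1
            else:
                hits_known_word += hit
                total_known_word += 1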
Example #2
    print('\b' * width + msg, end='')
    sys.stdout.flush()


if __name__ == '__main__':
    opts = docopt(__doc__)

    print('Loading model...')
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    print('Loading corpus...')
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('ancora/ancora-2.0/', files)
    parsed_sents = list(corpus.parsed_sents())

    print('Parsing...')
    hits, total_gold, total_model = 0, 0, 0
    n = len(parsed_sents)
    format_str = '{:3.1f}% ({}/{}) (P={:2.2f}%, R={:2.2f}%, F1={:2.2f}%)'
    progress(format_str.format(0.0, 0, n, 0.0, 0.0, 0.0))
    for i, gold_parsed_sent in enumerate(parsed_sents):
        tagged_sent = gold_parsed_sent.pos()

        # parse
        model_parsed_sent = model.parse(tagged_sent)

        # compute labeled scores
        gold_spans = spans(gold_parsed_sent, unary=False)
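
        # The excerpt is truncated here; a hedged sketch of the labeled
        # precision/recall/F1 update it is building toward, assuming spans()
        # returns the set of labeled spans of a tree:
        model_spans = spans(model_parsed_sent, unary=False)
        hits += len(gold_spans & model_spans)
        total_gold += len(gold_spans)
        total_model += len(model_spans)
        prec = 100.0 * hits / total_model if total_model else 0.0
        rec = 100.0 * hits / total_gold if total_gold else 0.0
        f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
        progress(format_str.format(100.0 * (i + 1) / n, i + 1, n, prec, rec, f1))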
Example #3
    print('\b' * width + msg, end='')
    sys.stdout.flush()


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the model
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # load the data
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('ancora/ancora-3.0.1es/', files)
    sents = list(corpus.tagged_sents())

    # tag
    hits, total = 0, 0
    n = len(sents)
    for i, sent in enumerate(sents):
        word_sent, gold_tag_sent = zip(*sent)

        model_tag_sent = model.tag(word_sent)
        assert len(model_tag_sent) == len(gold_tag_sent), i

        # global score
        hits_sent = [m == g for m, g in zip(model_tag_sent, gold_tag_sent)]
        hits += sum(hits_sent)
        total += len(sent)
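
    # A minimal sketch of the final report this loop is building toward:
    acc = float(hits) / total
    print('Accuracy: {:2.2f}%'.format(acc * 100))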
Example #4
from tagging.hmm import MLHMM
from tagging.memm import MEMM

models = {
    'base': BaselineTagger,
    'mlhmm': MLHMM,
    'memm': MEMM,
}

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
    path = '/home/alangb/Escritorio/ancora-3.0.1es/'
    corpus = SimpleAncoraCorpusReader(path, files)
    sents = list(corpus.tagged_sents())

    # train the model
    chosen_model = models[opts['-m']]
    n = opts['-n']
    if n is None:
        assert opts['-m'] == 'base'  # only the baseline tagger takes no n
        model = chosen_model(sents)
    else:
        classifier = opts['-c']
        if classifier is None:
            model = chosen_model(int(n), sents)
        else:
            model = chosen_model(int(n), sents, classifier)
Example #5
  stats.py -h | --help

Options:
  -h --help     Show this screen.
"""
from docopt import docopt

from corpus.ancora import SimpleAncoraCorpusReader

from collections import defaultdict, Counter

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    corpus = SimpleAncoraCorpusReader('ancora-3.0.1es/')
    sents = list(corpus.tagged_sents())

    words_tags = [(word, tag) for sent in sents for word, tag in sent]
    words, tags = zip(*words_tags)

    # basic statistics
    sents_length = len(sents)
    total_words = len(words)
    vocabulary_words = len(set(words))
    vocabulary_tags = len(set(tags))

    # most frequent tags and ambiguity
    counter_words = Counter(words)
    dict_frecuent_words = defaultdict(dict)
    ambiguity = defaultdict(int)
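
    # A hedged sketch of one way the ambiguity table might be filled, assuming
    # the "ambiguity level" of a word is its number of distinct tags:
    tags_per_word = defaultdict(set)
    for word, tag in words_tags:
        tags_per_word[word].add(tag)
    for word_tags in tags_per_word.values():
        ambiguity[len(word_tags)] += 1  # how many words have n candidate tags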
Example #6
  stats.py
  stats.py -h | --help

Options:
  -h --help     Show this screen.
"""
from docopt import docopt
from corpus.ancora import SimpleAncoraCorpusReader
from collections import defaultdict

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the Ancora corpus
    PATH = "./../../ancora-3.0.1es"
    corpus = SimpleAncoraCorpusReader(PATH)
    sents = list(corpus.tagged_sents())

    # Compute the statistics
    dict_words = defaultdict(int)  # dictionary: word -> count
    dict_tags = defaultdict(int)  # dictionary: tag -> count

    # dictionary: tag -> {word: count}
    dict_tag_word_count = defaultdict(lambda: defaultdict(int))

    # dictionary: word -> {tag: count}
    dict_word_tag_count = defaultdict(lambda: defaultdict(int))

    for sent in sents:
        # sent is a list of (word, tag) pairs
        for word, tag in sent:
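            # A minimal sketch of the counting this loop is setting up:
            dict_words[word] += 1
            dict_tags[tag] += 1
            dict_tag_word_count[tag][word] += 1
            dict_word_tag_count[word][tag] += 1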
Example #7
Options:
  -h --help     Show this screen.
"""

from operator import itemgetter as elem
from docopt import docopt
from collections import Counter, defaultdict
from corpus.ancora import SimpleAncoraCorpusReader

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    path = '/home/alangb/Escritorio/ancora-3.0.1es/'
    corpus = SimpleAncoraCorpusReader(path)
    sents = list(corpus.tagged_sents())

    # compute the statistics

    # get words and tags
    words_tags = [word_tag for sent in sents for word_tag in sent]
    words, tags = zip(*words_tags)
    word_types = set(words)
    tag_types = set(tags)

    # calculate 10 most common tags
    common_tags = Counter(tags).most_common(10)

    # calculate 5 most common words
    # for each one of the most common tags
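    # A minimal sketch of that computation, counting words separately per tag:
    words_by_tag = defaultdict(Counter)
    for word, tag in words_tags:
        words_by_tag[tag][word] += 1
    for tag, count in common_tags:
        print(tag, count, words_by_tag[tag].most_common(5))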
Example #8
  -h --help     Show this screen.
"""
# import sys
# sys.path.append("../../")

from docopt import docopt
from collections import defaultdict, Counter

from corpus.ancora import SimpleAncoraCorpusReader


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    corpus = SimpleAncoraCorpusReader('corpus/ancora-2.0/')
    sents = list(corpus.tagged_sents())

    # compute the statistics
    words_len = 0
    word_freq = Counter()
    tag_freq = Counter()
    wordsby_tag = defaultdict(Counter)
    tagsby_word = defaultdict(lambda: defaultdict(int))

    for sent in sents:
        words_len += len(sent)
        for word, tag in sent:
            word_freq[word] += 1
            tag_freq[tag] += 1
            wordsby_tag[tag][word] += 1
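            # The excerpt cuts off here; presumably the symmetric per-word
            # tally follows, since tagsby_word is defined above:
            tagsby_word[word][tag] += 1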
Example #9
    'ct': ClassifierTagger,
}

clasifiers = {
    'multinomial': MultinomialNB,
    'linear': LinearSVC,
    'LogisticRegression': LogisticRegression
}

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
    actual_dir = os.path.dirname(os.path.abspath(__file__))
    corpus = SimpleAncoraCorpusReader(actual_dir + '/corpus/ancora/', files)
    sents = list(corpus.tagged_sents())

    # train the model
    if opts['-n'] is not None:
        n = int(opts['-n'])
    m = opts['-m']
    c = opts['-c']
    wv_file = opts['-i']
    b = opts['-b'] == 'y'
    if m == 'ct':
        print("Model", m, "Training")
        model = models[m](wv_file=wv_file, is_bin=b)
        model.fit(sents)

    # save it
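    # A minimal sketch, assuming an output option '-o' (hypothetical; the real
    # flag is cut off in this excerpt) and that pickle is imported:
    filename = opts['-o']
    with open(filename, 'wb') as f:
        pickle.dump(model, f)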
Example #10
def evaluate(model=None, matrix='n'):
    '''
    model --   the trained model to evaluate; if None, it is loaded per the CLI options
    matrix --  whether to generate the confusion matrix ('y') or not ('n')
    '''
    start = time()
    matrix = (matrix == 'y')  # normalize the 'y'/'n' flag to a boolean
    filename = 'model'  # fallback base name for the confusion matrix plot
    if model is None:
        opts = docopt(__doc__)
        matrix = opts['-m'] == 'y'

        # load the model
        filename = 'Models/' + opts['-i']
        with open(filename, 'rb') as f:
            model = pickle.load(f)

    # load the data
    files = r'3LB-CAST/.*\.tbf\.xml'
    actual_dir = os.path.dirname(os.path.abspath(__file__))

    corpus = SimpleAncoraCorpusReader(actual_dir + '/corpus/ancora/', files)
    sents = list(corpus.tagged_sents())
    n = len(sents)

    # tag
    hits, total = 0, 0
    hits_known, hits_unknown = 0, 0
    total_known, total_unknown = 0, 0
    are_known = []

    # confusion matrix
    test = []
    prediction = []

    for i, sent in enumerate(sents):
        word_sent, gold_tag_sent = zip(*sent)
        model_tag_sent = model.tag(word_sent).tolist()
        assert len(model_tag_sent) == len(gold_tag_sent), i
        # For confusion matrix
        test += list(gold_tag_sent)
        prediction += model_tag_sent

        # global score
        hits_sent = [m == g for m, g in zip(model_tag_sent, gold_tag_sent)]
        hits += sum(hits_sent)
        total += len(sent)
        total_acc = float(hits) / total

        # known words score
        for j in range(len(hits_sent)):
            # Using the Counter trick described below, encode each word as an
            # integer that records both known/unknown status and hit/miss.
            if not model.unknown(word_sent[j]):
                are_known += [hits_sent[j] + 1]  # known: hit -> 2, miss -> 1
            else:
                are_known += [hits_sent[j] - 2]  # unknown: hit -> -1, miss -> -2

        progress('{:3.1f}% (Total: {:2.2f}%)'.format(
            float(i) * 100 / n, total_acc * 100))

    # For efficiency we use the Counter class from the collections library.
    # Name the integer codes so we can look them up afterwards:
    known = 2
    fail_known = 1
    unknown = -1
    fail_unknown = -2

    # Counter builds a dictionary mapping each code (known, fail_known,
    # unknown, fail_unknown) to its number of occurrences.
    counter = Counter(are_known)
    # Read off how many times each code appears:
    hits_known += counter[known]
    total_known += counter[known] + counter[fail_known]

    hits_unknown += counter[unknown]
    total_unknown += counter[unknown] + counter[fail_unknown]

    # Compute accuracy
    total_acc = float(hits) / total
    known_acc = float(hits_known) / total_known
    unknown_acc = float(hits_unknown) / total_unknown
    finish = time() - start
    print('')
    print('Total accuracy: {:2.2f}%'.format(total_acc * 100))
    print('Known accuracy: {:2.2f}%'.format(known_acc * 100))
    print('Unknown accuracy: {:2.2f}%'.format(unknown_acc * 100))
    print('Time running: {:2.2f} seconds'.format(finish))

    if matrix:
        matrix = confusion_matrix(test, prediction)
        classes = list(set(test) | set(prediction))
        classes.sort()
        plot_confusion_matrix(matrix, classes, filename.split('.')[0] + '.png')
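
The encoding trick in Example #10 deserves a standalone illustration: the hits are booleans, so adding 1 maps known words to {2, 1} and subtracting 2 maps unknown words to {-1, -2}, and a single Counter pass then recovers all four tallies at once. A minimal self-contained sketch:

from collections import Counter

# known hit -> 2, known miss -> 1, unknown hit -> -1, unknown miss -> -2
codes = [True + 1, False + 1, True - 2, False - 2]
assert Counter(codes) == Counter({2: 1, 1: 1, -1: 1, -2: 1})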