Example #1
# Assumed imports and module-level configuration shared by the examples
# below; the module paths follow the LXMLS toolkit, and the constant
# values are illustrative.
import lxmls.readers.pos_corpus as pcc
import lxmls.sequences.extended_feature as exfc
from lxmls import data

MAX_SENT_SIZE = 15       # illustrative
MAX_NR_SENTENCES = 1000  # illustrative
MODEL_DIR = "models/"    # illustrative; note the trailing separator

def build_corpus_features():
    corpus = pcc.PostagCorpus()
    train_seq = corpus.read_sequence_list_conll(data.find('train-02-21.conll'),
        max_sent_len=MAX_SENT_SIZE,
        max_nr_sent=MAX_NR_SENTENCES)
    corpus.add_sequence_list(train_seq)
    dev_seq = corpus.read_sequence_list_conll(data.find('dev-22.conll'))
    corpus.add_sequence_list(dev_seq)
    categories = [
        'adventure',
        'belles_lettres',
        'editorial',
        'fiction',
        'government',
        'hobbies',
        'humor',
        'learned',
        'lore',
        'mystery',
        'news',
        'religion',
        'reviews',
        'romance']
    for cat in categories:
        brown_seq = corpus.read_sequence_list_brown(categories=cat)
        corpus.add_sequence_list(brown_seq)
    features = exfc.ExtendedFeatures(corpus)
    features.build_features()
    corpus.save_corpus(MODEL_DIR)
    features.save_features(MODEL_DIR + "features.txt")
    return corpus, features
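A minimal training sketch using the corpus and features built above; the structured perceptron import path and hyperparameters follow the LXMLS labs but are assumptions here:
import lxmls.sequences.structured_perceptron as spc

corpus, features = build_corpus_features()
train_seq = corpus.read_sequence_list_conll(data.find('train-02-21.conll'),
                                            max_sent_len=MAX_SENT_SIZE,
                                            max_nr_sent=MAX_NR_SENTENCES)
sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, features)
sp.num_epochs = 5  # illustrative
sp.train_supervised(train_seq)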
Example #2
def eval_model(corpus, features, model):
    # The features argument is unused here; it is kept for interface
    # parity with the training helpers.
    dev_seq = corpus.read_sequence_list_conll(data.find('dev-22.conll'))
    pred_dev = model.viterbi_decode_corpus_log(dev_seq.seq_list)
    eval_dev = model.evaluate_corpus(dev_seq.seq_list, pred_dev)
    print("Accuracy on WSJ development %f" % eval_dev)
    test_seq = corpus.read_sequence_list_conll(data.find('test-23.conll'))
    pred_test = model.viterbi_decode_corpus_log(test_seq.seq_list)
    eval_test = model.evaluate_corpus(test_seq.seq_list, pred_test)
    print("Accuracy on WSJ test %f" % eval_test)
def corpus_and_sequences():
    corpus = pcc.PostagCorpus()
    train_seq = corpus.read_sequence_list_conll(data.find('train-02-21.conll'),
                                                max_sent_len=10,
                                                max_nr_sent=1000)
    dev_seq = corpus.read_sequence_list_conll(data.find('dev-22.conll'),
                                              max_sent_len=10,
                                              max_nr_sent=1000)
    test_seq = corpus.read_sequence_list_conll(data.find('test-23.conll'),
                                               max_sent_len=10,
                                               max_nr_sent=1000)
    return corpus, train_seq, dev_seq, test_seq
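A hypothetical driver tying the two helpers above together; the HMM import path and smoothing value follow the LXMLS labs, and the model is assumed to expose the viterbi_decode_corpus_log method used by eval_model:
import lxmls.sequences.hmm as hmmc

corpus, train_seq, dev_seq, test_seq = corpus_and_sequences()
hmm = hmmc.HMM(corpus.word_dict, corpus.tag_dict)
hmm.train_supervised(train_seq, smoothing=0.1)
eval_model(corpus, None, hmm)  # features is unused by eval_model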
Example #4
def build_corpus_features():
    corpus = pcc.PostagCorpus()
    train_seq = corpus.read_sequence_list_conll(data.find('train-02-21.conll'),
        max_sent_len=MAX_SENT_SIZE,
        max_nr_sent=MAX_NR_SENTENCES)
    corpus.add_sequence_list(train_seq)
    features = exfc.ExtendedFeatures(corpus)
    features.build_features()
    corpus.save_corpus(MODEL_DIR)
    features.save_features(MODEL_DIR + "features.txt")
    return corpus, features
Example #5
    def __init__(self, **config):

        # Assumes module-level imports: numpy as np, itertools.chain, and
        # the lxmls PostagCorpus / compacify / data helpers.
        corpus = PostagCorpus()
        train_seq = corpus.read_sequence_list_conll(data.find('train-02-21.conll'), max_sent_len=15, max_nr_sent=1000)
        dev_seq = corpus.read_sequence_list_conll(data.find('dev-22.conll'), max_sent_len=15, max_nr_sent=1000)
        test_seq = corpus.read_sequence_list_conll(data.find('test-23.conll'), max_sent_len=15, max_nr_sent=1000)

        # Redo indices so that they are consecutive. Also cast all data to numpy arrays
        # of int32 for compatibility with GPUs and theano and add reverse index
        train_seq, test_seq, dev_seq = compacify(train_seq, test_seq, dev_seq, theano=True)

        # Get number of words and tags in the corpus
        self.input_size = len(train_seq.x_dict)
        self.output_size = len(train_seq.y_dict)

        # Data-sets
        self.datasets = {
            'train': {
                'input': [np.array(seq.x) for seq in train_seq],
                'output': [np.array(seq.y) for seq in train_seq]
            },
            'dev': {
                'input': [np.array(seq.x) for seq in dev_seq],
                'output': [np.array(seq.y) for seq in dev_seq]
            },
            'test': {
                'input': [np.array(seq.x) for seq in test_seq],
                'output': [np.array(seq.y) for seq in test_seq]
            }
        }
        # Config
        self.config = config
        # Number of samples
        self.nr_samples = {
            sset: len(content['output'])
            for sset, content in self.datasets.items()
        }
        # Length of the longest sentence across all three splits
        self.maxL = max(chain(*[[len(seq) for seq in content['input']]
                                for content in self.datasets.values()]))
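A minimal sketch of how the datasets dictionary built above might be consumed; the reader argument is an instance of the (unnamed) class this __init__ belongs to, and the helper name is illustrative:
def iterate_split(reader, sset='train'):
    # Yield one sentence at a time from the chosen split as
    # (word indices, tag indices) int32 arrays
    for x, y in zip(reader.datasets[sset]['input'],
                    reader.datasets[sset]['output']):
        yield x, y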
Example #7
import os
import urllib.request

def download_embeddings(embedding_name, target_file):
    '''
    Download a file over HTTP with a progress report.

    Adapted from Stack Overflow:
    http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python
    '''

    # Embedding download URLs
    if embedding_name == 'senna_50':
        # senna_50 embeddings
        source_url = 'http://lxmls.it.pt/2015/wp-content/uploads/2015/senna_50'
    else:
        raise ValueError("I do not have embeddings %s for download"
                         % embedding_name)

    target_file_name = os.path.basename(target_file)  # name shown in the report
    u = urllib.request.urlopen(source_url)
    with open(target_file, 'wb') as f:
        meta = u.info()
        # urlopen().info() returns an email.message.Message in Python 3,
        # so headers are read with .get(), not the old getheaders()
        file_size = int(meta.get("Content-Length"))
        file_size_dl = 0
        block_sz = 8192
        print("Downloading: %s Bytes: %s" % (target_file_name, file_size))
        while True:
            text_buffer = u.read(block_sz)
            if not text_buffer:
                break
            file_size_dl += len(text_buffer)
            f.write(text_buffer)
            status = r"%10d  [%3.2f%%]" % (file_size_dl,
                                           file_size_dl*100./file_size)
            status = status + chr(8)*(len(status)+1)
            print(status)
    print("")
Example #9
import codecs
import gzip
from lxmls.sequences.label_dictionary import *
from lxmls.sequences.sequence import *
from lxmls.sequences.sequence_list import *
from lxmls import data
from os.path import dirname
import numpy as np  # This is also needed for theano=True

# from nltk.corpus import brown

# Train and test files for the English WSJ portion of the Penn Treebank.
# data.find resolves each file inside the lxmls data directory; calling it
# at import time ensures the corpora can be located.
data.find('train-02-21.conll')
data.find('dev-22.conll')
data.find('test-23.conll')

# Train and test files for the Portuguese Floresta Sintatica
data.find('pt_train.txt')
pt_dev = ""  # no development file for the Portuguese data
data.find('pt_test.txt')


def compacify(train_seq, test_seq, dev_seq, theano=False):
    """
    Create a map for indices that is be compact (do not have unused indices)
    """

    # REDO DICTS
    new_x_dict = LabelDictionary()
    new_y_dict = LabelDictionary(['noun'])
    for corpus_seq in [train_seq, test_seq, dev_seq]:
        for seq in corpus_seq:
            # Collect only the words and tags that actually occur
            for index in seq.x:
                word = corpus_seq.x_dict.get_label_name(index)
                if word not in new_x_dict:
                    new_x_dict.add(word)
            for index in seq.y:
                tag = corpus_seq.y_dict.get_label_name(index)
                if tag not in new_y_dict:
                    new_y_dict.add(tag)

    # REDO INDICES
    for corpus_seq in [train_seq, test_seq, dev_seq]:
        for seq in corpus_seq:
            seq.x = [new_x_dict[corpus_seq.x_dict.get_label_name(index)] for index in seq.x]
            seq.y = [new_y_dict[corpus_seq.y_dict.get_label_name(index)] for index in seq.y]
            if theano:
                # Cast to int32 numpy arrays for GPU/theano compatibility
                seq.x = np.array(seq.x, dtype='int32')
                seq.y = np.array(seq.y, dtype='int32')
        # Install the compact dictionaries and add reverse (id -> label) indices
        corpus_seq.x_dict = new_x_dict
        corpus_seq.y_dict = new_y_dict
        corpus_seq.word_dict = {v: k for k, v in new_x_dict.items()}
        corpus_seq.tag_dict = {v: k for k, v in new_y_dict.items()}

    return train_seq, test_seq, dev_seq
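A short usage sketch for compacify, mirroring the call in Example #5; the read parameters are the same illustrative ones used there:
corpus = PostagCorpus()
train_seq = corpus.read_sequence_list_conll(data.find('train-02-21.conll'), max_sent_len=15, max_nr_sent=1000)
dev_seq = corpus.read_sequence_list_conll(data.find('dev-22.conll'), max_sent_len=15, max_nr_sent=1000)
test_seq = corpus.read_sequence_list_conll(data.find('test-23.conll'), max_sent_len=15, max_nr_sent=1000)
train_seq, test_seq, dev_seq = compacify(train_seq, test_seq, dev_seq, theano=True)
print(len(train_seq.x_dict), len(train_seq.y_dict))  # compact vocabulary and tag set sizes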