from ltlib import filelog
from ltlib import conlldata
from ltlib import viterbi

from ltlib.features import NormEmbeddingFeature, SennaCapsFeature
from ltlib.features import windowed_inputs
from ltlib.callbacks import token_evaluator, EpochTimer
from ltlib.layers import concat, inputs_and_embeddings
from ltlib.settings import cli_settings, log_settings
from ltlib.optimizers import get_optimizer
from ltlib.util import unique
from ltlib.output import save_token_predictions_multi_output

from config import Defaults

config = cli_settings(['datadir', 'datasets', 'wordvecs'], Defaults)
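# One convolution filter count must be given for each filter size.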
assert len(config.filter_nums) == len(config.filter_sizes)

datasets = config.datasets.split(',')

data = []
max_fs = []
max_vfs = []

# Load each dataset and keep per-dataset trackers, presumably for the best
# F-scores seen so far.
for ind, dataset in enumerate(datasets):
    data_path = config.datadir + '/' + dataset
    data.append(conlldata.load_dir(data_path, config))
    max_fs.append((0.0, 0.0))
    max_vfs.append((0.0, 0.0))
    
max_y = 0
Example 2
# Imports assumed for this example (the excerpt shows only the function); Keras 1.x
# and the ltlib module paths from the neighbouring examples are assumed.
# load_dir, W_regularizer, evaluation_summary and get_best_epoch are helpers from
# the original file and are not shown here.
import logging

from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Reshape
from keras.layers import Convolution2D, MaxPooling2D

from ltlib.features import NormEmbeddingFeature, FixedWidthInput
from ltlib.callbacks import EpochTimer, WeightStore, document_evaluator
from ltlib.layers import concat, inputs_and_embeddings
from ltlib.metrics import f1, prec, rec
from ltlib.settings import cli_settings
from ltlib.optimizers import get_optimizer

from config import Defaults


def main(argv):
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    data = load_dir(config.datadir, config)

    # Words listed in the optional config.oov file are forced to be out-of-vocabulary.
    force_oov = (set(l.strip() for l in open(config.oov))
                 if config.oov else None)
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    features = [w2v]
    data.tokens.add_features(features)
    # Summarize word vector featurizer statistics (OOV etc.)
    logging.info(features[0].summary())
    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution
    seq = concat(embeddings)
    cshape = (config.doc_size,
              sum(f.output_dim for f in features))  # (document length, total feature width)
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings) #old way of doing the above

    # Convolution(s)
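    # For each filter size, convolve over the full feature width, max-pool over
    # the remaining document positions, and flatten to a fixed-size vector.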
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes,
                                       config.filter_nums):
        seq2 = Convolution2D(filter_num,
                             filter_size,
                             cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)

    seq = concat(convLayers)
    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='softmax')(seq)
    model = Model(input=inputs, output=out)

    if config.verbosity != 0:
        logging.info(model.summary())

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1, prec, rec])

    weights, results = [], {}
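    # Store the weights after every epoch and evaluate on each data split.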
    callbacks = [
        EpochTimer(),
        WeightStore(weights),
        document_evaluator(data.train, label='train', results=results),
        document_evaluator(data.devel, label='devel', results=results),
    ]
    if config.test:
        callbacks.append(
            document_evaluator(data.test, label='test', results=results))

    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(
                         data.devel.documents.inputs,
                         data.devel.documents.targets,
                     ),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
    # logging.info(hist.history)

    for k, values in results.items():
        s = lambda v: str(v) if not isinstance(v, float) else '{:.4f}'.format(v)
        logging.info('\t'.join(s(i) for i in [k] + values))

    evalsets = [data.devel] + ([data.test] if config.test else [])
    for s in evalsets:
        logging.info('last epoch, {}: {}'.format(
            s.name, evaluation_summary(model, s, 0, config)))
    epoch = get_best_epoch(results, 'devel', config)
    model.set_weights(weights[epoch])
    if config.threshold:
        threshold = results['devel/maxf-threshold'][epoch]
    else:
        threshold = 0.0
    for s in evalsets:
        logging.info('best devel epoch th {} ({}), {}: {}'.format(
            threshold, config.target_metric, s.name,
            evaluation_summary(model, s, threshold, config)))
from ltlib import filelog
from ltlib import conlldata
from ltlib import viterbi

from ltlib.features import NormEmbeddingFeature, SennaCapsFeature
from ltlib.features import windowed_inputs
from ltlib.callbacks import token_evaluator, EpochTimer
from ltlib.layers import concat, inputs_and_embeddings
from ltlib.settings import cli_settings, log_settings
from ltlib.optimizers import get_optimizer
from ltlib.output import save_token_predictions

from baseline_config import Defaults

config = cli_settings(['datadir', 'wordvecs'], Defaults)

data = conlldata.load_dir(config.datadir, config)

# Viterbi-based mapper for post-processing token predictions into consistent
# label sequences, estimated from the training sentences.
vmapper = viterbi.get_prediction_mapper(data.train.sentences, config)

w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                     max_rank=config.max_vocab_size,
                                     vocabulary=data.vocabulary,
                                     name='words')
features = [w2v]
if config.word_features:
    features.append(SennaCapsFeature(name='caps'))

data.tokens.add_features(features)
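# Build fixed-width context-window inputs around each token.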
data.tokens.add_inputs(windowed_inputs(config.window_size, features))
Example 4
# Imports assumed for this example (only the function was captured); Keras 1.x
# and the ltlib module paths from the neighbouring examples are assumed.
# load_dir, W_regularizer and evaluator are helpers from the original file and
# are not shown here.
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Reshape
from keras.layers import Convolution2D, MaxPooling2D

from ltlib.features import NormEmbeddingFeature, FixedWidthInput
from ltlib.callbacks import EpochTimer
from ltlib.layers import concat, inputs_and_embeddings
from ltlib.settings import cli_settings
from ltlib.optimizers import get_optimizer

from config import Defaults


def main(argv):
    global data
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    # data is expected to be loaded at module level; the original call is left commented out:
    # load_dir(config.datadir, config)

    print("finished reading data")
    # Words listed in the optional config.oov file are forced to be out-of-vocabulary.
    force_oov = (set(l.strip() for l in open(config.oov))
                 if config.oov else None)
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    print("finished reading embeddings")
    features = [w2v]
    data.tokens.add_features(features)
    # (the featurizer summary logging from the example above is omitted here)

    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution
    seq = concat(embeddings)
    cshape = (config.doc_size,
              sum(f.output_dim for f in features))  # (document length, total feature width)
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings) #old way of doing the above

    # Convolution(s)
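    # For each filter size, convolve over the full feature width, max-pool over
    # the remaining document positions, and flatten to a fixed-size vector.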
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes,
                                       config.filter_nums):
        seq2 = Convolution2D(filter_num,
                             filter_size,
                             cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)

    seq = concat(convLayers)
    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='sigmoid')(seq)
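    # Note: unlike the softmax variant above, this example pairs a sigmoid output
    # with the categorical cross-entropy loss used in compile() below.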
    model = Model(input=inputs, output=out)

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer
                  #metrics=['accuracy', f1, prec, rec]
                  )

    weights, results = [], {}
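    # Only devel-set evaluation is enabled here; weight storage and the other
    # evaluators from the example above are commented out.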
    callbacks = [
        EpochTimer(),
        #WeightStore(weights),
        #document_evaluator(data.train, label='train', results=results),
        evaluator(data.devel, label='devel', results=results)
    ]
    # if config.test:
    #     callbacks.append(
    #         document_evaluator(data.test, label='test', results=results))

    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(
                         data.devel.documents.inputs,
                         data.devel.documents.targets,
                     ),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)