def eval_test(modelPath):
    global data
    # data = MultiLabelDataReader(Defaults.input_path).load(index)
    #model = ltlib.util.load_keras(modelPath)
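    # NOTE: `model` must already be constructed (e.g. restored at module level)
    # before this function runs; only the trained weights are loaded from model.h5.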
    model.load_weights(modelPath + "model.h5")
    optimizer = get_optimizer(Defaults)

    print("STARTING TEST")

    force_oov = set(l.strip()
                    for l in open(Defaults.oov)) if Defaults.oov else None
    w2v = NormEmbeddingFeature.from_file(Defaults.embedding_path,
                                         max_rank=Defaults.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens

    features = [w2v]
    data.tokens.add_features(features)
    # Summarize word vector featurizer statistics (OOV etc.)
    #    logging.info(features[0].summary())
    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(Defaults.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    # inputs, embeddings = inputs_and_embeddings(features, Defaults)

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1, prec, rec])
    predictions = model.predict(data.test.documents.inputs,
                                batch_size=Defaults.batch_size)
    # print(str(predictions))
    data.test.documents.set_predictions(predictions)
    print("TEST RESULTS for: " + str(len(predictions)))
    best_sigmoid = utility.readDictFromStringFile(Defaults.output_path +
                                                  "out.txt")["best_sigmoid_t"]
    res = data.test.eval(sigmoid_t=best_sigmoid)
    res["sigmoid_t"] = best_sigmoid
    print(str(res))
    np.save(Defaults.pred_path + "pred", data.test.get_predictions())
    utility.writeDictAsStringFile(res, Defaults.results_path + "res.txt")
    conv_outputs.append(cout)
seq = concat(conv_outputs)

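# Shared fully-connected trunk on top of the concatenated convolution outputs.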
for size in config.hidden_sizes:
    seq = Dense(size, activation=config.hidden_activation)(seq)
seq = Dropout(config.output_drop_prob)(seq)

# Create private outputs
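# Each dataset gets its own softmax head; all heads share width max_y so per-task
# targets can be zero-padded to a common shape.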
outs = []
for ind, dataset in enumerate(datasets):
    #outs.append(Dense(data[ind].tokens.target_dim, activation='softmax')(seq))

    outs.append(Dense(max_y, activation='softmax')(seq))

model = Model(input=inputs, output=outs)
optimizer = get_optimizer(config)

model.compile(loss=['categorical_crossentropy'] * len(datasets),
              optimizer=optimizer,
              metrics=['accuracy'])
#model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

x_batch = []
y_batch = []
concatenated = True
for ind, ds in enumerate(data):
    x_batch.append(ds.train.tokens.inputs['words'])
    
    #out_labels = [np.zeros(data[ind_].train.tokens.targets.shape) for ind_, dataset in enumerate(datasets)]
    #out_labels[ind] = data[ind].train.tokens.targets
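    # Zero targets for every head (padded to width max_y); only the current dataset's
    # head is meant to receive real labels (see the commented-out variant above).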
    
    out_labels = [np.zeros((data[ind_].train.tokens.targets.shape[0], max_y)) for ind_, dataset in enumerate(datasets)]
    y_ = data[ind].train.tokens.targets
data.tokens.add_inputs(windowed_inputs(config.window_size, features))

# Log word vector feature stat summary
info('{}: {}'.format(config.wordvecs, w2v.summary()))

inputs, embeddings = inputs_and_embeddings(features, config)

seq = concat(embeddings)
seq = Flatten()(seq)
for size in config.hidden_sizes:
    seq = Dense(size, activation=config.hidden_activation)(seq)
seq = Dropout(config.output_drop_prob)(seq)
out = Dense(data.tokens.target_dim, activation='softmax')(seq)
model = Model(input=inputs, output=out)

optimizer = get_optimizer(config)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

callbacks = [
    EpochTimer(),
    token_evaluator(data.train, config=config),
    token_evaluator(data.test, mapper=vmapper, config=config),
]

model.fit(data.train.tokens.inputs,
          data.train.tokens.targets,
          callbacks=callbacks,
          batch_size=config.batch_size,
          nb_epoch=config.epochs)
Example #4
def main(argv):
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    data = load_dir(config.datadir, config)

    force_oov = set(l.strip()
                    for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    features = [w2v]
    data.tokens.add_features(features)
    # Summarize word vector featurizer statistics (OOV etc.)
    logging.info(features[0].summary())
    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution
    seq = concat(embeddings)
    # (document length, total feature dimension): the 2D input to the convolutions
    cshape = (config.doc_size, sum(f.output_dim for f in features))
    seq = Reshape((1, ) + cshape)(seq)
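    # seq now has shape (1, doc_size, total feature dim): a single input "channel"
    # for the Theano-ordered ('th') Convolution2D layers below.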
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings) #old way of doing the above

    # Convolution(s)
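    # Each Convolution2D filter spans filter_size tokens by the full feature width
    # (cshape[1]); max-pooling over the remaining doc_size - filter_size + 1 positions
    # keeps the single strongest response per filter (max-over-time pooling).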
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes,
                                       config.filter_nums):
        seq2 = Convolution2D(filter_num,
                             filter_size,
                             cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)

    seq = concat(convLayers)
    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='softmax')(seq)
    model = Model(input=inputs, output=out)

    if config.verbosity != 0:
        logging.info(model.summary())

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1, prec, rec])

    weights, results = [], {}
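    # WeightStore snapshots the weights after every epoch and the evaluators record
    # per-epoch scores in `results`, so the best devel epoch can be restored below.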
    callbacks = [
        EpochTimer(),
        WeightStore(weights),
        document_evaluator(data.train, label='train', results=results),
        document_evaluator(data.devel, label='devel', results=results),
    ]
    if config.test:
        callbacks.append(
            document_evaluator(data.test, label='test', results=results))

    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(
                         data.devel.documents.inputs,
                         data.devel.documents.targets,
                     ),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
    # logging.info(history.history)

    for k, values in results.items():
        s = lambda v: str(v) if not isinstance(v, float) else '{:.4f}'.format(v)
        logging.info('\t'.join(s(i) for i in [k] + values))

    evalsets = [data.devel] + ([data.test] if config.test else [])
    for s in evalsets:
        logging.info('last epoch, {}: {}'.format(
            s.name, evaluation_summary(model, s, 0, config)))
    epoch = get_best_epoch(results, 'devel', config)
    model.set_weights(weights[epoch])
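    # Reuse the F-maximizing threshold recorded for that epoch on devel;
    # fall back to 0.0 when thresholding is disabled.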
    if config.threshold:
        threshold = results['devel/maxf-threshold'][epoch]
    else:
        threshold = 0.0
    for s in evalsets:
        logging.info('best devel epoch th {} ({}), {}: {}'.format(
            threshold, config.target_metric, s.name,
            evaluation_summary(model, s, threshold, config)))
Example #5
def main(argv):
    global data
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    ##load_dir(config.datadir, config)
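    # `data` is a module-level global here; it is assumed to be loaded elsewhere
    # (the load_dir call above is left commented out).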

    print("finished reading data")
    force_oov = set(l.strip()
                    for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    print("finished reading embeddings")
    features = [w2v]
    data.tokens.add_features(features)
    # Summarize word vector featurizer statistics (OOV etc.)

    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution
    seq = concat(embeddings)
    # (document length, total feature dimension): the 2D input to the convolutions
    cshape = (config.doc_size, sum(f.output_dim for f in features))
    seq = Reshape((1, ) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings) #old way of doing the above

    # Convolution(s)
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes,
                                       config.filter_nums):
        seq2 = Convolution2D(filter_num,
                             filter_size,
                             cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)

    seq = concat(convLayers)
    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
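    # Sigmoid output: independent per-label scores for the multi-label setting
    # (cf. the sigmoid_t threshold applied in eval_test above).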
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='sigmoid')(seq)
    model = Model(input=inputs, output=out)

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer
                  #metrics=['accuracy', f1, prec, rec]
                  )

    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        #WeightStore(weights),
        #document_evaluator(data.train, label='train', results=results),
        evaluator(data.devel, label='devel', results=results)
    ]
    #if config.test:
    #callbacks.append(document_evaluator(data.test, label='test',
    #                                       results=results))

    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(
                         data.devel.documents.inputs,
                         data.devel.documents.targets,
                     ),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
Example #6
        
    seq = concat(conv_outputs[ind])
      
    for size in config.hidden_sizes:
        fully_connected.append(Dense(size, activation=config.hidden_activation, name='dense-1-%d' % ind)(seq))
    dropout.append(Dropout(config.output_drop_prob, name='dropout-%d' % ind)(fully_connected[ind]))

pos_dense_out = Dense(data[0].tokens.target_dim, activation='softmax', name='pos-dense-out')(dropout[0])

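# The NER head sees both branches: the POS branch's dropout output is concatenated
# with the NER branch's, while the POS head uses only its own branch.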
ner_merged = merge([dropout[0], dropout[1]], mode='concat')
ner_dense_out = Dense(data[1].tokens.target_dim, activation='softmax', name='ner-dense-out')(ner_merged)

pos_model = Model(input=pos_inputs, output=pos_dense_out)
ner_model = Model(input=pos_inputs + ner_inputs, output=ner_dense_out)

pos_model.compile(optimizer=get_optimizer(config), loss='categorical_crossentropy', metrics=['accuracy'])
ner_model.compile(optimizer=get_optimizer(config), loss='categorical_crossentropy', metrics=['accuracy'])

models = [pos_model, ner_model]

time_str = datetime.datetime.now().isoformat()
print("Started training at: %s" % time_str)

for ind, ds in enumerate(data):
    for ep in range(1, config.epochs + 1):
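        # Sample one random contiguous slice covering percent_keep of this dataset's
        # training tokens for this epoch.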
        percnt_keep = config.percent_keep
        amt_keep = len(ds.train.tokens.inputs['words-%s' % ind]) * percnt_keep
        print("Total: %s. Keeping: %s" % (len(ds.train.tokens.inputs['words-%s' % ind]), amt_keep))
        start = random.randrange(int(len(ds.train.tokens.inputs['words-%s' % ind]) - amt_keep) + 1)
        end = int(start + amt_keep)
        x = ds.train.tokens.inputs['words-%s' % ind][start:end]