def eval_test(modelPath):
    global data
    # data = MultiLabelDataReader(Defaults.input_path).load(index)
    #model = ltlib.util.load_keras(modelPath)
    model.load_weights(modelPath + "model.h5")
    optimizer = get_optimizer(Defaults)
    print("STARTING TEST")
    force_oov = set(l.strip() for l in open(Defaults.oov)) if Defaults.oov else None
    w2v = NormEmbeddingFeature.from_file(Defaults.embedding_path,
                                         max_rank=Defaults.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    features = [w2v]
    data.tokens.add_features(features)
    # Summarize word vector featurizer statistics (OOV etc.)
    # logging.info(features[0].summary())
    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(Defaults.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])
    # Create keras input and embedding for each feature
    # inputs, embeddings = inputs_and_embeddings(features, Defaults)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1, prec, rec])
    predictions = model.predict(data.test.documents.inputs,
                                batch_size=Defaults.batch_size)
    # print(str(predictions))
    data.test.documents.set_predictions(predictions)
    print("TEST RESULTS for: " + str(len(predictions)))
    best_sigmoid = utility.readDictFromStringFile(
        Defaults.output_path + "out.txt")["best_sigmoid_t"]
    res = data.test.eval(sigmoid_t=best_sigmoid)
    res["sigmoid_t"] = best_sigmoid
    print(str(res))
    np.save(Defaults.pred_path + "pred", data.test.get_predictions())
    utility.writeDictAsStringFile(res, Defaults.results_path + "res.txt")
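# Hedged sketch: eval_test() calls model.load_weights() but never builds `model`
# itself (the ltlib.util.load_keras line is commented out), so the architecture
# has to be reconstructed first. One standard Keras 1.x way is model_from_json;
# the "model.json" file name here is an assumption, not from the original code.
def load_model_sketch(modelPath):
    from keras.models import model_from_json
    # Rebuild the architecture saved at training time, then the caller can
    # load the matching weights from modelPath + "model.h5".
    with open(modelPath + "model.json") as f:
        return model_from_json(f.read())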
def main(argv):
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    data = load_dir(config.datadir, config)
    force_oov = set(l.strip() for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    features = [w2v]
    data.tokens.add_features(features)
    # Summarize word vector featurizer statistics (OOV etc.)
    logging.info(features[0].summary())
    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])
    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)
    # Combine and reshape for convolution
    seq = concat(embeddings)
    # Document length and total width of all features
    cshape = (config.doc_size, sum(f.output_dim for f in features))
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings)  # old way of doing the above
    # Convolution(s)
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
        seq2 = Convolution2D(filter_num, filter_size, cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)
    seq = concat(convLayers)
    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='softmax')(seq)
    model = Model(input=inputs, output=out)
    if config.verbosity != 0:
        logging.info(model.summary())
    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1, prec, rec])
    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        WeightStore(weights),
        document_evaluator(data.train, label='train', results=results),
        document_evaluator(data.devel, label='devel', results=results),
    ]
    if config.test:
        callbacks.append(document_evaluator(data.test, label='test',
                                            results=results))
    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(
                         data.devel.documents.inputs,
                         data.devel.documents.targets,
                     ),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
    # logging.info(history.history)
    for k, values in results.items():
        s = lambda v: str(v) if not isinstance(v, float) else '{:.4f}'.format(v)
        logging.info('\t'.join(s(i) for i in [k] + values))
    evalsets = [data.devel] + ([data.test] if config.test else [])
    for s in evalsets:
        logging.info('last epoch, {}: {}'.format(
            s.name, evaluation_summary(model, s, 0, config)))
    epoch = get_best_epoch(results, 'devel', config)
    model.set_weights(weights[epoch])
    if config.threshold:
        threshold = results['devel/maxf-threshold'][epoch]
    else:
        threshold = 0.0
    for s in evalsets:
        logging.info('best devel epoch th {} ({}), {}: {}'.format(
            threshold, config.target_metric, s.name,
            evaluation_summary(model, s, threshold, config)))
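# Shape walkthrough for one convolution branch above (illustrative numbers only,
# assuming doc_size=400 and a 300-dim embedding; real values come from config):
#   input to Convolution2D ('th' ordering):            (batch, 1, 400, 300)
#   after Convolution2D(filter_num, filter_size, 300,
#                       border_mode='valid'):          (batch, filter_num, 400 - filter_size + 1, 1)
#   after MaxPooling2D((400 - filter_size + 1, 1)):    (batch, filter_num, 1, 1)
#   after Flatten():                                   (batch, filter_num)
# Concatenating the branches then yields a vector of sum(config.filter_nums)
# features that feeds the dense layers.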
def main(argv):
    global data
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    ##load_dir(config.datadir, config)
    print("finished reading data")
    force_oov = set(l.strip() for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    print("finished reading embeddings")
    features = [w2v]
    data.tokens.add_features(features)
    # Summarize word vector featurizer statistics (OOV etc.)
    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])
    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)
    # Combine and reshape for convolution
    seq = concat(embeddings)
    # Document length and total width of all features
    cshape = (config.doc_size, sum(f.output_dim for f in features))
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings)  # old way of doing the above
    # Convolution(s)
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
        seq2 = Convolution2D(filter_num, filter_size, cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)
    seq = concat(convLayers)
    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='sigmoid')(seq)
    model = Model(input=inputs, output=out)
    optimizer = get_optimizer(config)
    # NOTE: with sigmoid outputs on multi-label targets, binary_crossentropy is
    # the usual loss; categorical_crossentropy is kept here as in the original.
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer
                  #metrics=['accuracy', f1, prec, rec]
                  )
    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        #WeightStore(weights),
        #document_evaluator(data.train, label='train', results=results),
        evaluator(data.devel, label='devel', results=results)
    ]
    #if config.test:
    #    callbacks.append(document_evaluator(data.test, label='test',
    #                                        results=results))
    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(
                         data.devel.documents.inputs,
                         data.devel.documents.targets,
                     ),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
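# Minimal entry-point sketch (an assumption: the original scripts may define
# their own runner elsewhere; this just mirrors the main(argv) convention above).
if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))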