                                         name='words')
features = [w2v]
if config.word_features:
    features.append(SennaCapsFeature('caps'))
for ind, dataset in enumerate(datasets):
    data[ind].tokens.add_features(features)
    data[ind].tokens.add_inputs(windowed_inputs(config.window_size, features))

# Log word vector feature stat summary
info('{}: {}'.format(config.wordvecs, w2v.summary()))

inputs, embeddings = inputs_and_embeddings(features, config)

# Combine and reshape for convolution
seq = concat(embeddings)
cshape = (config.window_size, sum(f.output_dim for f in features))
seq = Reshape((1,) + cshape)(seq)

# Convolutions
conv_outputs = []
for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
    conv = Convolution2D(filter_num, filter_size, cshape[1],
                         activation='relu')(seq)
    cout = Flatten()(conv)
    conv_outputs.append(cout)
seq = concat(conv_outputs)

for size in config.hidden_sizes:
    seq = Dense(size, activation=config.hidden_activation)(seq)
seq = Dropout(config.output_drop_prob)(seq)
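The concat helper used throughout these excerpts is not shown; a minimal sketch, assuming the Keras 1.x functional API that the rest of the code targets, could be:

from keras.layers import merge

def concat(tensors):
    # A single tensor passes through unchanged; several tensors are
    # concatenated along the last (feature) axis with the Keras 1 merge().
    if len(tensors) == 1:
        return tensors[0]
    return merge(tensors, mode='concat', concat_axis=-1)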
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         name='words')
features = [w2v]
if config.word_features:
    features.append(SennaCapsFeature(name='caps'))
data.tokens.add_features(features)
data.tokens.add_inputs(windowed_inputs(config.window_size, features))

# Log word vector feature stat summary
info('{}: {}'.format(config.wordvecs, w2v.summary()))

inputs, embeddings = inputs_and_embeddings(features, config)
seq = concat(embeddings)
seq = Flatten()(seq)
for size in config.hidden_sizes:
    seq = Dense(size, activation=config.hidden_activation)(seq)
seq = Dropout(config.output_drop_prob)(seq)
out = Dense(data.tokens.target_dim, activation='softmax')(seq)
model = Model(input=inputs, output=out)

optimizer = get_optimizer(config)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

callbacks = [
    EpochTimer(),
    token_evaluator(data.train, config=config),
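windowed_inputs comes from the supporting library and its implementation is not shown here. Purely as an illustration of the windowing idea (not that function's actual code), fixed-width token windows centred on each token could be built like this:

def token_windows(token_ids, window_size, padding_id=0):
    # Surround the sequence with padding so every token gets a full window.
    half = window_size // 2
    padded = [padding_id] * half + list(token_ids) + [padding_id] * half
    return [padded[i:i + window_size] for i in range(len(token_ids))]

# token_windows([5, 6, 7], 3) -> [[0, 5, 6], [5, 6, 7], [6, 7, 0]]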
def main(argv):
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    data = load_dir(config.datadir, config)

    force_oov = set(l.strip() for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    features = [w2v]
    data.tokens.add_features(features)

    # Summarize word vector featurizer statistics (OOV etc.)
    logging.info(features[0].summary())

    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution
    seq = concat(embeddings)
    # Document length combined with the total feature dimensionality
    cshape = (config.doc_size, sum(f.output_dim for f in features))
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings)    # old way of doing the above

    # Convolution(s)
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
        seq2 = Convolution2D(filter_num, filter_size, cshape[1],
                             border_mode='valid', activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)
    seq = concat(convLayers)

    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='softmax')(seq)
    model = Model(input=inputs, output=out)

    if config.verbosity != 0:
        logging.info(model.summary())

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1, prec, rec])

    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        WeightStore(weights),
        document_evaluator(data.train, label='train', results=results),
        document_evaluator(data.devel, label='devel', results=results),
    ]
    if config.test:
        callbacks.append(
            document_evaluator(data.test, label='test', results=results))

    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(data.devel.documents.inputs,
                                      data.devel.documents.targets),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
    # logging.info(history.history)

    for k, values in results.items():
        s = lambda v: str(v) if not isinstance(v, float) else '{:.4f}'.format(v)
        logging.info('\t'.join(s(i) for i in [k] + values))

    evalsets = [data.devel] + ([data.test] if config.test else [])
    for s in evalsets:
        logging.info('last epoch, {}: {}'.format(
            s.name, evaluation_summary(model, s, 0, config)))

    # Restore weights from the best devel epoch before the final evaluation
    epoch = get_best_epoch(results, 'devel', config)
    model.set_weights(weights[epoch])
    if config.threshold:
        threshold = results['devel/maxf-threshold'][epoch]
    else:
        threshold = 0.0
    for s in evalsets:
        logging.info('best devel epoch th {} ({}), {}: {}'.format(
            threshold, config.target_metric, s.name,
            evaluation_summary(model, s, threshold, config)))
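With dim_ordering='th', each convolution branch above sees input of shape (1, doc_size, emb_dim); a 'valid' kernel of shape (filter_size, emb_dim) leaves doc_size - filter_size + 1 positions, and max-pooling over exactly that many positions keeps one value per filter, so flattening yields filter_num units per branch. A small sanity check of this arithmetic, with illustrative settings (not the script's actual configuration):

def flattened_conv_width(doc_size, filter_sizes, filter_nums):
    # Width of the concatenated vector after conv + full-length max-pool + flatten.
    total = 0
    for filter_size, filter_num in zip(filter_sizes, filter_nums):
        positions = doc_size - filter_size + 1   # 'valid' convolution output length
        assert positions >= 1, 'filter wider than document'
        total += filter_num                      # pooled to one value per filter
    return total

assert flattened_conv_width(400, [3, 4, 5], [100, 100, 100]) == 300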
def main(argv):
    global data
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    ##load_dir(config.datadir, config)
    print("finished reading data")

    force_oov = set(l.strip() for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    print("finished reading embeddings")
    features = [w2v]
    data.tokens.add_features(features)

    # Summarize word vector featurizer statistics (OOV etc.)

    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution
    seq = concat(embeddings)
    # Document length combined with the total feature dimensionality
    cshape = (config.doc_size, sum(f.output_dim for f in features))
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings)    # old way of doing the above

    # Convolution(s)
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
        seq2 = Convolution2D(filter_num, filter_size, cshape[1],
                             border_mode='valid', activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)
    seq = concat(convLayers)

    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='sigmoid')(seq)
    model = Model(input=inputs, output=out)

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer
                  #metrics=['accuracy', f1, prec, rec]
                  )

    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        #WeightStore(weights),
        #document_evaluator(data.train, label='train', results=results),
        evaluator(data.devel, label='devel', results=results)
    ]
    #if config.test:
    #    callbacks.append(document_evaluator(data.test, label='test',
    #                                        results=results))

    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(data.devel.documents.inputs,
                                      data.devel.documents.targets),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
                                         vocabulary=all_vocab,
                                         name='words-%s' % ind)
    features = [w2v]
    if config.word_features:
        features.append(SennaCapsFeature('caps'))
    data[ind].tokens.add_features(features)
    data[ind].tokens.add_inputs(windowed_inputs(config.window_size, features))

    # Log word vector feature stat summary
    info('{}: {}'.format(config.wordvecs, w2v.summary()))

    if ind == 0:
        pos_inputs, pos_embeddings = inputs_and_embeddings(features, config)
        pos_x = concat(pos_embeddings)
    if ind == 1:
        ner_inputs, ner_embeddings = inputs_and_embeddings(features, config)
        ner_x = concat(ner_embeddings)

cshapes = []
reshapes = []

# Combine and reshape for convolution
pos_cshape = (config.window_size, sum(f.output_dim for f in features))
ner_cshape = (config.window_size, sum(f.output_dim for f in features))
cshapes.append(pos_cshape)
cshapes.append(ner_cshape)

pos_reshape = Reshape((1,) + pos_cshape, name='pos-reshape')(pos_x)
ner_reshape = Reshape((1,) + ner_cshape, name='ner-reshape')(ner_x)
reshapes.append(pos_reshape)