from logging import info  # simple logger; the original may use a project-specific helper

from keras.layers import Reshape, Convolution2D

# NormEmbeddingFeature, SennaCapsFeature, windowed_inputs, inputs_and_embeddings
# and concat are assumed to be provided by the project's feature/embedding
# helpers; config, datasets and data (the loaded datasets) are assumed to be
# set up earlier in the script.

# Collect the union of all dataset vocabularies.
all_vocab = set()
for ind, dataset in enumerate(datasets):
    all_vocab.update(data[ind].vocabulary)

# Load normalized word embeddings, restricted to the shared vocabulary.
w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                     max_rank=config.max_vocab_size,
                                     vocabulary=all_vocab,
                                     name='words')
features = [w2v]
if config.word_features:
    features.append(SennaCapsFeature('caps'))

# Attach the features and windowed inputs to every dataset's tokens.
for ind, dataset in enumerate(datasets):
    data[ind].tokens.add_features(features)
    data[ind].tokens.add_inputs(windowed_inputs(config.window_size, features))

# Log word vector feature statistics summary
info('{}: {}'.format(config.wordvecs, w2v.summary()))

inputs, embeddings = inputs_and_embeddings(features, config)

# Combine and reshape for convolution
seq = concat(embeddings)
cshape = (config.window_size, sum(f.output_dim for f in features))
seq = Reshape((1,) + cshape)(seq)

# Convolutions
conv_outputs = []
for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
    conv = Convolution2D(filter_num, filter_size, cshape[1],
                         activation='relu')(seq)
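# The listing above is cut off inside the convolution loop. As a hedged,
# self-contained illustration (toy sizes, not the listing's config), the
# snippet below shows what the Reshape((1,) + cshape) plus
# Convolution2D(filter_num, filter_size, cshape[1]) combination does in
# Keras 1.x: with the kernel width equal to the token feature dimension,
# each filter slides over the context window and yields one feature per
# valid window position.
import numpy as np
from keras.models import Model
from keras.layers import Input, Convolution2D, Flatten

window_size, emb_dim, n_filters, filter_rows = 5, 8, 4, 3
x = Input(shape=(1, window_size, emb_dim))  # one "channel" of window x embedding
h = Convolution2D(n_filters, filter_rows, emb_dim,
                  activation='relu', dim_ordering='th')(x)
h = Flatten()(h)  # (window_size - filter_rows + 1) positions * n_filters = 12 features
toy = Model(input=x, output=h)
print(toy.predict(np.random.rand(2, 1, window_size, emb_dim)).shape)  # -> (2, 12)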
from logging import info  # simple logger; the original may use a project-specific helper

from keras.models import Model
from keras.layers import Flatten, Dense, Dropout

# cli_settings, Defaults, conlldata, viterbi, NormEmbeddingFeature,
# SennaCapsFeature, windowed_inputs, inputs_and_embeddings, concat and
# get_optimizer are assumed to be provided by the project's helper modules.

# Command-line settings, CoNLL-format data, and a prediction mapper
# (Viterbi decoding over the output tag sequence).
config = cli_settings(['datadir', 'wordvecs'], Defaults)
data = conlldata.load_dir(config.datadir, config)
vmapper = viterbi.get_prediction_mapper(data.train.sentences, config)

# Load normalized word embeddings, restricted to the dataset vocabulary.
w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                     max_rank=config.max_vocab_size,
                                     vocabulary=data.vocabulary,
                                     name='words')
features = [w2v]
if config.word_features:
    features.append(SennaCapsFeature(name='caps'))

data.tokens.add_features(features)
data.tokens.add_inputs(windowed_inputs(config.window_size, features))

# Log word vector feature statistics summary
info('{}: {}'.format(config.wordvecs, w2v.summary()))

inputs, embeddings = inputs_and_embeddings(features, config)

# Flatten the window of embeddings and feed it through dense hidden layers,
# ending in a softmax over the token tag set.
seq = concat(embeddings)
seq = Flatten()(seq)
for size in config.hidden_sizes:
    seq = Dense(size, activation=config.hidden_activation)(seq)
seq = Dropout(config.output_drop_prob)(seq)
out = Dense(data.tokens.target_dim, activation='softmax')(seq)

model = Model(input=inputs, output=out)  # Keras 1.x functional API
optimizer = get_optimizer(config)
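# Hedged sketch of the step that would typically follow, not part of the
# listing above: in Keras 1.x the model is compiled with the chosen optimizer
# and a categorical cross-entropy loss matching the softmax output, and then
# trained on the windowed token inputs. The fit call is left as a comment
# because the input/target arrays and the config field names used there
# (batch_size, epochs) are assumptions for illustration.
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# model.fit(x_train, y_train, batch_size=config.batch_size, nb_epoch=config.epochs)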