def build_model_and_objective(n_classes, n_input_dimensions, X, Y):
    model = CSM(
        layers=[
            Softmax(
                n_classes=n_classes,
                n_input_dimensions=n_input_dimensions),
        ],
    )

    lengths = np.zeros(X.shape[0])
    data_provider = BatchDataProvider(
        X=X,
        Y=Y,
        lengths=lengths)

    cost_function = CrossEntropy()
    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=data_provider)

    update_rule = AdaGrad(
        gamma=0.1,
        model_template=model)

    optimizer = SGD(
        model=model,
        objective=objective,
        update_rule=update_rule)

    return model, objective, optimizer, data_provider
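# A minimal usage sketch for the function above. The shapes and the
# one-hot label layout are illustrative assumptions, not taken from the
# repo; adapt them to BatchDataProvider's actual contract.
import numpy as np

X_demo = np.random.randn(100, 40)                      # 100 examples, 40 dims (assumed)
Y_demo = np.eye(2)[np.random.randint(2, size=100)]     # one-hot labels (assumed)

model, objective, optimizer, data_provider = build_model_and_objective(
    n_classes=2, n_input_dimensions=40, X=X_demo, Y=Y_demo)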
def get_model(encoding):
    return CSM(
        layers=[
            DictionaryEncoding(vocabulary=encoding),

            WordEmbedding(
                dimension={{embedding_dimension}},
                vocabulary_size=len(encoding),
                padding=encoding['PADDING']),

            {% for layer in word_layers %}
            {% set layer_index = loop.index0 %}
            SentenceConvolution(
                n_feature_maps={{layer.n_feature_maps}},
                kernel_width={{layer.kernel_width}},
                n_channels={{layer.n_channels}},
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps={{layer.n_feature_maps}}),

            KMaxPooling(
                k={{layer.k_pooling}},
                k_dynamic={{layer.k_dynamic}} if {{layer.k_dynamic}} > 0 else None),

            {{layer.nonlinearity}}(),
            {% endfor %}

            ReshapeForDocuments(),

            {% for layer in sentence_layers %}
            {% set layer_index = loop.index0 %}
            SentenceConvolution(
                n_feature_maps={{layer.n_feature_maps}},
                kernel_width={{layer.kernel_width}},
                n_channels={{layer.n_channels}},
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps={{layer.n_feature_maps}}),

            KMaxPooling(
                k={{layer.k_pooling}},
                k_dynamic={{layer.k_dynamic}} if {{layer.k_dynamic}} > 0 else None),

            {{layer.nonlinearity}}(),
            {% endfor %}

            {% if dropout %}
            Dropout(('b', 'd', 'f', 'w'), 0.5),
            {% endif %}

            Softmax(
                n_classes={{n_classes}},
                n_input_dimensions={{softmax_input_dimensions}}),
        ])
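# The block above is a Jinja2 template for generating model-building
# source code, not directly runnable Python. A rendering sketch follows;
# jinja2 is a real library, but the template file name and every
# parameter value below are illustrative assumptions.
import jinja2

with open("get_model.py.template") as template_file:  # hypothetical file name
    template_source = template_file.read()

rendered_source = jinja2.Template(template_source).render(
    embedding_dimension=32,
    word_layers=[dict(n_feature_maps=6, kernel_width=7, n_channels=32,
                      k_pooling=4, k_dynamic=0.5, nonlinearity="Tanh")],
    sentence_layers=[dict(n_feature_maps=14, kernel_width=5, n_channels=6,
                          k_pooling=4, k_dynamic=0, nonlinearity="Tanh")],
    dropout=True,
    n_classes=2,
    softmax_input_dimensions=56)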
def model_one_layer_variant_2(alphabet):
    return CSM(layers=[
        DictionaryEncoding(vocabulary=alphabet),
        WordEmbedding(
            dimension=42,
            vocabulary_size=len(alphabet)),
        SentenceConvolution(
            n_feature_maps=5,
            kernel_width=6,
            n_channels=1,
            n_input_dimensions=42),
        SumFolding(),
        KMaxPooling(k=4),
        Bias(
            n_input_dims=21,
            n_feature_maps=5),
        Tanh(),
        # 5 feature maps * k=4 * 21 folded dimensions = 420 softmax inputs
        Softmax(n_classes=2, n_input_dimensions=420),
    ])
def remove_dropout(model):
    new_model = []
    ratio = 0
    for layer in model.layers:
        if layer.__class__.__name__ == 'Dropout':
            # Remember the dropout rate so the following layer can be adjusted.
            ratio = layer.dropout_rate
        else:
            if ratio == 0:
                new_model.append(layer)
            else:
                # Rebuild the layer that followed a Dropout using the handler
                # registered for its class in __function_mapping (defined
                # elsewhere in this module).
                new_model.append(
                    __function_mapping[layer.__class__.__name__](layer, ratio))
                ratio = 0
    return CSM(layers=new_model)
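# Hypothetical usage of remove_dropout: build an inference-time copy of a
# trained model with the Dropout layers stripped. "trained_model",
# "X_batch", and "meta" are illustrative names only.
inference_model = remove_dropout(trained_model)
Y_hat = inference_model.fprop(X_batch, meta=meta)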
def model_one_layer_large_embedding(alphabet):
    return CSM(layers=[
        DictionaryEncoding(vocabulary=alphabet),
        WordEmbedding(
            dimension=32 * 4,
            vocabulary_size=len(alphabet)),
        SentenceConvolution(
            n_feature_maps=5,
            kernel_width=10,
            n_channels=1,
            n_input_dimensions=32 * 4),
        Relu(),
        SumFolding(),
        SumFolding(),
        SumFolding(),
        KMaxPooling(k=7),
        Bias(
            n_input_dims=16,
            n_feature_maps=5),
        Tanh(),
        MaxFolding(),
        Softmax(n_classes=2, n_input_dimensions=280),
    ])
def txtnets_model_from_gensim_word2vec(gensim_model):
    # Build the vocabulary mapping from word to index.
    encoding = {}
    for index, word in enumerate(gensim_model.index2word):
        encoding[word] = index
    encoding['PADDING'] = len(encoding)

    vocabulary_size = len(encoding)
    embedding_dim = gensim_model.syn0.shape[1]

    # Append a zero row to the embedding matrix for the padding token.
    E = np.concatenate(
        [gensim_model.syn0, np.zeros((1, embedding_dim))],
        axis=0)

    txtnets_model = CSM(layers=[
        DictionaryEncoding(vocabulary=encoding),
        WordEmbedding(
            vocabulary_size=vocabulary_size,
            dimension=embedding_dim,
            padding=encoding['PADDING'],
            E=E,
        )
    ])

    return txtnets_model
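# Sketch of wrapping pretrained word2vec vectors as a txtnets encoder;
# this mirrors the main() script below, with an illustrative path.
import gensim

gensim_model = gensim.models.Word2Vec.load_word2vec_format(
    "stanford-movie-vectors.bin", binary=True)
embedding_model = txtnets_model_from_gensim_word2vec(gensim_model)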
tweet_model = CSM(
    layers=[
        # cpu.model.encoding.
        DictionaryEncoding(vocabulary=alphabet),

        # cpu.model.embedding.
        WordEmbedding(
            dimension=28,
            vocabulary_size=len(alphabet)),

        # HostToDevice(),

        SentenceConvolution(
            n_feature_maps=6,
            kernel_width=7,
            n_channels=1,
            n_input_dimensions=28),
        Bias(
            n_input_dims=28,
            n_feature_maps=6),
        SumFolding(),
        KMaxPooling(k=4, k_dynamic=0.5),
        Tanh(),

        SentenceConvolution(
            n_feature_maps=14,
            kernel_width=5,
            n_channels=6,
            n_input_dimensions=14),
        Bias(
            n_input_dims=14,
            n_feature_maps=14),
        SumFolding(),
        KMaxPooling(k=4),
        Tanh(),

        # 14 feature maps * k=4 * 7 folded dimensions = 392 softmax inputs
        Softmax(
            n_classes=2,
            n_input_dimensions=392),
    ]
)
def main():
    random.seed(34532)
    np.random.seed(675)
    np.set_printoptions(linewidth=100)

    data_dir = os.path.join(
        "/users/mdenil/code/txtnets/txtnets_deployed/data",
        "stanfordmovie")

    trainer = Word2Vec(
        train=os.path.join(
            data_dir, "stanfordmovie.train.sentences.clean.projected.txt"),
        output="stanford-movie-vectors.bin",
        cbow=1,
        size=300,
        window=8,
        negative=25,
        hs=0,
        sample=1e-4,
        threads=20,
        binary=1,
        iter=15,
        min_count=1)

    trainer.train()

    gensim_model = gensim.models.Word2Vec.load_word2vec_format(
        "/users/mdenil/code/txtnets/txtnets_deployed/code/stanford-movie-vectors.bin",
        binary=True)

    # print(gensim_model.most_similar(["refund"]))
    # print(gensim_model.most_similar(["amazing"]))

    embedding_model = txtnets_model_from_gensim_word2vec(gensim_model)

    with open(os.path.join(
            data_dir,
            "stanfordmovie.train.sentences.clean.projected.flat.json")) as data_file:
        data = json.load(data_file)

    random.shuffle(data)
    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    batch_size = 100
    n_validation = 500

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X[:-n_validation],
        Y=Y[:-n_validation],
        batch_size=batch_size,
        padding='PADDING')

    transformed_train_data_provider = TransformedLabelledDataProvider(
        data_source=train_data_provider,
        transformer=embedding_model)

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X[-n_validation:],
        Y=Y[-n_validation:],
        batch_size=batch_size,
        padding='PADDING')

    transformed_validation_data_provider = TransformedLabelledDataProvider(
        data_source=validation_data_provider,
        transformer=embedding_model)

    logistic_regression = CSM(
        layers=[
            Sum(axes=['w']),
            Softmax(
                n_input_dimensions=gensim_model.syn0.shape[1],
                n_classes=2)
        ])

    cost_function = CrossEntropy()
    regularizer = L2Regularizer(lamb=1e-4)

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=transformed_train_data_provider,
        regularizer=regularizer)

    update_rule = AdaGrad(
        gamma=0.1,
        model_template=logistic_regression)

    optimizer = SGD(
        model=logistic_regression,
        objective=objective,
        update_rule=update_rule)

    for batch_index, iteration_info in enumerate(optimizer):
        if batch_index % 100 == 0:
            # print(iteration_info['cost'])

            # Evaluate accuracy on the held-out validation set.
            Y_hat = []
            Y_valid = []
            for _ in xrange(transformed_validation_data_provider.batches_per_epoch):
                X_valid_batch, Y_valid_batch, meta_valid = \
                    transformed_validation_data_provider.next_batch()
                Y_valid.append(get(Y_valid_batch))
                Y_hat.append(get(logistic_regression.fprop(
                    X_valid_batch, meta=meta_valid)))

            Y_valid = np.concatenate(Y_valid, axis=0)
            Y_hat = np.concatenate(Y_hat, axis=0)

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

            print("B: {}, A: {}, C: {}".format(
                batch_index, acc, iteration_info['cost']))

    # Open in binary mode: pickle protocol -1 is a binary protocol.
    with open("model_w2vec_logreg.pkl", 'wb') as model_file:
        pickle.dump(embedding_model.move_to_cpu(), model_file, protocol=-1)
        pickle.dump(logistic_regression.move_to_cpu(), model_file, protocol=-1)
model = CSM(
    layers=[
        WordEmbedding(
            dimension=embedding_dimension,
            vocabulary_size=len(alphabet)),

        SentenceConvolution(
            n_feature_maps=n_feature_maps,
            kernel_width=kernel_width,
            n_channels=1,
            n_input_dimensions=embedding_dimension),

        SumFolding(),
        KMaxPooling(k=pooling_size),

        # Bias(
        #     n_input_dims=embedding_dimension / 2,
        #     n_feature_maps=n_feature_maps),

        Linear(
            n_input=n_feature_maps * pooling_size * embedding_dimension / 2,
            n_output=64),

        Tanh(),

        Linear(
            n_output=1,
            n_input=64),
    ]
)
vocabulary_size = len(alphabet)
n_feature_maps = 8
kernel_width = 5
pooling_size = 2
n_epochs = 1

model = CSM(layers=[
    WordEmbedding(
        dimension=embedding_dimension,
        vocabulary_size=len(alphabet)),
    SentenceConvolution(
        n_feature_maps=n_feature_maps,
        kernel_width=kernel_width,
        n_channels=1,
        n_input_dimensions=embedding_dimension),
    SumFolding(),
    KMaxPooling(k=pooling_size),
    # Bias(
    #     n_input_dims=embedding_dimension / 2,
    #     n_feature_maps=n_feature_maps),
    Linear(
        n_input=n_feature_maps * pooling_size * embedding_dimension / 2,
        n_output=64),
    Tanh(),
    Linear(n_output=1, n_input=64),
])

print model

cost_function = LargeMarginCost(0.1)
noise_model = RandomAlphabetCorruption(alphabet)

objective = NoiseContrastiveObjective(cost=cost_function,
model = CSM(layers=[
    DictionaryEncoding(vocabulary=encoding),

    WordEmbedding(
        dimension=20,
        vocabulary_size=len(encoding),
        padding=encoding['PADDING']),

    Dropout(('b', 'w', 'f'), 0.2),

    SentenceConvolution(
        n_feature_maps=10,
        kernel_width=15,
        n_channels=20,
        n_input_dimensions=1),
    Bias(
        n_input_dims=1,
        n_feature_maps=10),
    KMaxPooling(k=7, k_dynamic=0.5),
    Tanh(),

    SentenceConvolution(
        n_feature_maps=30,
        kernel_width=9,
        n_channels=10,
        n_input_dimensions=1),
    Bias(
        n_input_dims=1,
        n_feature_maps=30),
    KMaxPooling(k=5),
    Tanh(),

    ReshapeForDocuments(),

    SentenceConvolution(
        n_feature_maps=20,
        kernel_width=11,
        n_channels=30 * 5,
        n_input_dimensions=1),
    Bias(
        n_input_dims=1,
        n_feature_maps=20),
    KMaxPooling(k=5),
    Tanh(),

    Dropout(('b', 'd', 'f', 'w'), 0.5),

    # 20 feature maps * k=5 = 100 softmax inputs
    Softmax(n_classes=2, n_input_dimensions=100),
])
## BUILD THE MODEL
model = CSM(
    layers=[
        WordEmbedding(
            dimension=embedding_dimension,
            vocabulary_size=vocabulary_size),

        SentenceConvolution(
            n_feature_maps=n_feature_maps,
            kernel_width=kernel_width,
            n_channels=1,
            n_input_dimensions=embedding_dimension),
        SumFolding(),
        KMaxPooling(k=pooling_size * 2),
        Bias(
            n_input_dims=embedding_dimension / 2,
            n_feature_maps=n_feature_maps),
        Tanh(),

        # Softmax(
        #     n_classes=n_classes,
        #     n_input_dimensions=420),

        SentenceConvolution(
            n_feature_maps=n_feature_maps,
            kernel_width=3,
            n_channels=n_feature_maps,
            n_input_dimensions=embedding_dimension / 2),
        KMaxPooling(k=pooling_size),
        Bias(
            n_input_dims=embedding_dimension / 2,
            n_feature_maps=n_feature_maps),
        Tanh(),

        Softmax(n_classes=n_classes, n_input_dimensions=420),
    ],
)

print model
model = CSM(
    layers=[
        WordEmbedding(
            dimension=embedding_dimension,
            vocabulary_size=vocabulary_size),

        SentenceConvolution(
            n_feature_maps=n_feature_maps,
            kernel_width=kernel_width,
            n_channels=1,
            n_input_dimensions=embedding_dimension),

        # KMaxPooling(k=pooling_size),

        # TODO: make a bias that runs along the w dimension
        Bias(
            n_input_dims=embedding_dimension,
            n_feature_maps=n_feature_maps),

        MaxFolding(),

        SentenceConvolution(
            n_feature_maps=3,
            kernel_width=5,
            n_channels=n_feature_maps,
            n_input_dimensions=embedding_dimension / 2),

        MaxFolding(),
        MaxFolding(),

        Softmax(
            n_classes=n_classes,
            n_input_dimensions=3 * (context_length + kernel_width - 1 + 5 - 1) * embedding_dimension / 8),
    ]
)
train_data_provider = LabelledSequenceMinibatchProvider(
    X=X[:-500],
    Y=Y[:-500],
    batch_size=100)

print train_data_provider.batches_per_epoch

validation_data_provider = LabelledSequenceMinibatchProvider(
    X=X[-500:],
    Y=Y[-500:],
    batch_size=500)

word_embedding_model = CSM(layers=[
    WordEmbedding(  # really a character embedding
        dimension=16,
        vocabulary_size=len(alphabet)),
    SentenceConvolution(
        n_feature_maps=10,
        kernel_width=5,
        n_channels=1,
        n_input_dimensions=16),
    SumFolding(),
    KMaxPooling(k=2),
    MaxFolding(),
    Tanh(),
])

word_embedding = WordFromCharacterEmbedding(
    embedding_model=word_embedding_model,
    alphabet_encoding=alphabet)

# print word_embedding.fprop(X, meta)

tweet_model = CSM(layers=[
    word_embedding,
    SentenceConvolution(n_feature_maps=5,