def data_dense():
  """Data to feed into code prediction model"""

  base = os.environ['DATA_ROOT']
  train_dir = os.path.join(base, cfg.get('data', 'train'))
  test_dir = os.path.join(base, cfg.get('data', 'test'))

  # load pre-trained model
  rl = cfg.get('data', 'rep_layer')
  model = load_model(cfg.get('data', 'model_file'))
  interm_layer_model = Model(inputs=model.input,
                             outputs=model.get_layer(rl).output)
  maxlen = model.get_layer(name='EL').get_config()['input_length']

  # load target task training data
  dataset_provider = dataset.DatasetProvider(
    train_dir,
    cfg.get('data', 'alphabet_pickle'))
  x_train, y_train = dataset_provider.load_keras(maxlen=maxlen)
  x_train = pad_sequences(x_train, maxlen=maxlen)

  # make training vectors for target task
  print('x_train shape (original):', x_train.shape)
  x_train = interm_layer_model.predict(x_train)
  print('x_train shape (new):', x_train.shape)

  # now load the test set
  dataset_provider = dataset.DatasetProvider(
    test_dir,
    cfg.get('data', 'alphabet_pickle'))
  x_test, y_test = dataset_provider.load_keras(maxlen=maxlen)
  x_test = pad_sequences(x_test, maxlen=maxlen)

  # make test vectors for target task
  x_test = interm_layer_model.predict(x_test)

  return x_train, y_train, x_test, y_test
def grid_search():
  """Grid search using sklearn API"""

  # load target task train and test data
  base = os.environ['DATA_ROOT']
  train_dir = os.path.join(base, cfg.get('data', 'train'))
  test_dir = os.path.join(base, cfg.get('data', 'test'))
  maxlen = get_maxlen()

  dataset_provider = dataset.DatasetProvider(
    train_dir,
    cfg.get('data', 'alphabet_pickle'))
  x_train, y_train = dataset_provider.load_keras(maxlen=maxlen)
  x_train = pad_sequences(x_train, maxlen=maxlen)

  classifier = KerasClassifier(make_model)
  param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'epochs': [3, 5, 7]}
  validator = GridSearchCV(
    classifier,
    param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=1)
  validator.fit(x_train, y_train)

  print('best param:', validator.best_params_)
def main():
  """Do out-of-core training here"""

  configure_model_dir()

  base = os.environ['DATA_ROOT']
  dp = dataset.DatasetProvider(os.path.join(base, cfg.get('data', 'train')),
                               cfg.get('args', 'max_files'),
                               cfg.getint('args', 'max_cuis'),
                               cfg.getint('args', 'samples_per_doc'),
                               cfg.getint('bow', 'batch'),
                               cfg.getboolean('args', 'make_alphabet'),
                               cfg.getboolean('args', 'verbose'))

  max_cuis = int(cfg.get('args', 'max_cuis'))
  model = get_model(max_cuis, max_cuis - 1)

  optim = getattr(optimizers, cfg.get('bow', 'optimizer'))
  model.compile(loss='binary_crossentropy',
                optimizer=optim(lr=10**cfg.getint('bow', 'log10lr')),
                metrics=['accuracy'])

  callback = ModelCheckpoint('Model/model.h5',
                             verbose=1,
                             save_best_only=True)

  # load validation data
  val_x, val_y = dp.load(os.path.join(base, cfg.get('data', 'dev')))
  print('dev x, y shapes:', val_x.shape, val_y.shape)

  steps = math.ceil(dp.train_size / cfg.getint('bow', 'batch'))
  print('steps per epoch:', steps)

  model.fit_generator(dp.stream(),
                      validation_data=(val_x, val_y),
                      epochs=cfg.getint('bow', 'epochs'),
                      steps_per_epoch=steps,
                      verbose=0,
                      callbacks=[callback])

  # save final model
  model.save('Model/final.h5')

  # probability for each class; (test size, num of classes)
  distribution = model.predict(val_x)

  # turn into an indicator matrix
  distribution[distribution < 0.5] = 0
  distribution[distribution >= 0.5] = 1

  f1 = f1_score(val_y, distribution, average='macro')
  p = precision_score(val_y, distribution, average='macro')
  r = recall_score(val_y, distribution, average='macro')
  print("\nmacro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))

  f1 = f1_score(val_y, distribution, average='micro')
  p = precision_score(val_y, distribution, average='micro')
  r = recall_score(val_y, distribution, average='micro')
  print("micro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
def main(args):

  # np.random.seed(1337)

  if len(args) < 1:
    sys.stderr.write("Error - one required argument: <data directory>\n")
    sys.exit(-1)

  working_dir = args[0]
  data_file = os.path.join(working_dir, 'training-data.liblinear')

  # learn alphabet from training data
  provider = dataset.DatasetProvider(data_file)
  # now load training examples and labels
  train_x, train_y = provider.load(data_file)

  # turn x and y into numpy array among other things
  maxlen = max([len(seq) for seq in train_x])
  classes = len(set(train_y))
  train_x = pad_sequences(train_x, maxlen=maxlen)
  train_y = to_categorical(np.array(train_y), classes)

  # loading pre-trained embedding file
  embeddings_index = {}
  f = open(os.path.join(working_dir, 'mimic.txt'))
  values = f.readline().split()
  EMBEDDING_WORDNUM = int(values[0])
  EMBEDDING_DIM = int(values[1])
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
  f.close()
  print('load embeddings for %s=%s words.' %
        (len(embeddings_index), EMBEDDING_WORDNUM))

  # prepare embedding matrix
  nb_words = len(provider.word2int)
  embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
  for word, i in provider.word2int.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be all-zeros.
      embedding_matrix[i] = embedding_vector

  print 'train_x shape:', train_x.shape
  print 'train_y shape:', train_y.shape

  # train_x, valid_x, train_y, valid_y = train_test_split(
  #   train_x, train_y, test_size=0.1, random_state=18)

  optim = RandomSearch(
    lambda: get_random_config(),
    lambda x, y: run_one_eval(x, y,
                              train_x, train_y,
                              maxlen,
                              len(provider.word2int),
                              classes,
                              embedding_matrix,
                              EMBEDDING_DIM))
  best_config = optim.optimize()

  print("Best config: %s" % best_config)
  sys.exit(0)
def main():
  """Driver function"""

  base = os.environ['DATA_ROOT']
  dp = dataset.DatasetProvider(os.path.join(base, cfg.get('data', 'train')),
                               cfg.get('data', 'model_dir'),
                               cfg.getint('args', 'max_seq_len'),
                               cfg.get('args', 'n_files'),
                               cfg.get('args', 'n_x1_cuis'),
                               cfg.get('args', 'n_x2_cuis'))
  x1, x2, y = dp.load()
  print('x1 shape:', x1.shape)
  print('x2 shape:', x2.shape)
  print('y shape:', y.shape)

  train_x1, val_x1, train_x2, val_x2, train_y, val_y = train_test_split(
    x1, x2, y, test_size=cfg.getfloat('args', 'test_size'))

  # TODO: figure out what to do about negated cuis
  init_vectors = None
  if cfg.has_option('data', 'embed'):
    embed_file = os.path.join(base, cfg.get('data', 'embed'))
    w2v = word2vec.Model(embed_file, verbose=True)
    init_vectors = [w2v.select_vectors(dp.tokenizer.word_index)]

  model = get_model_concat_no_sharing(
    len(dp.tokenizer.word_index) + 1,
    x1.shape[1],
    init_vectors)
  model.compile(loss='binary_crossentropy',
                optimizer='rmsprop',
                metrics=['accuracy'])

  # save the model after every epoch
  callback = ModelCheckpoint(cfg.get('data', 'model_dir') + 'model.h5',
                             verbose=1,
                             save_best_only=True)

  model.fit([train_x1, train_x2],
            train_y,
            validation_data=([val_x1, val_x2], val_y),
            epochs=cfg.getint('dan', 'epochs'),
            batch_size=cfg.getint('dan', 'batch'),
            validation_split=0.0,
            callbacks=[callback])

  # are we training the best model?
  if cfg.getfloat('args', 'test_size') == 0:
    model.save(cfg.get('data', 'model_dir') + 'model.h5')
    exit()

  probs = model.predict([val_x1, val_x2])
  predictions = (probs > 0.5).astype(int)
  accuracy = accuracy_score(val_y, predictions)
  print('accuracy: ', accuracy)
def main(args):

  if len(args) < 1:
    sys.stderr.write("Error - one required argument: <data directory>\n")
    sys.exit(-1)

  working_dir = args[0]
  data_file = os.path.join(working_dir, 'training-data.liblinear')

  # learn alphabet from training data
  provider = dataset.DatasetProvider(data_file)
  # now load training examples and labels
  train_x, train_y = provider.load(data_file)

  # turn x and y into numpy array among other things
  maxlen = max([len(seq) for seq in train_x])
  classes = len(set(train_y))
  train_x = pad_sequences(train_x, maxlen=maxlen)
  train_y = to_categorical(np.array(train_y), classes)

  pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
  pickle.dump(provider.word2int,
              open(os.path.join(working_dir, 'word2int.p'), "wb"))
  pickle.dump(provider.label2int,
              open(os.path.join(working_dir, 'label2int.p'), "wb"))

  print 'train_x shape:', train_x.shape
  print 'train_y shape:', train_y.shape

  model = Sequential()
  model.add(Embedding(len(provider.word2int), 300, input_length=maxlen))
  model.add(GlobalAveragePooling1D())
  model.add(Dropout(0.25))
  model.add(Dense(1000, kernel_regularizer=regularizers.l2(0.00001)))
  model.add(Activation('relu'))
  model.add(Dropout(0.25))
  model.add(Dense(classes, kernel_regularizer=regularizers.l2(0.00001)))
  model.add(Activation('softmax'))

  optimizer = RMSprop(lr=0.0005, rho=0.9, epsilon=1e-08)
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  model.fit(train_x,
            train_y,
            epochs=10,
            batch_size=50,
            verbose=0,
            validation_split=0.0,
            class_weight=None)

  json_string = model.to_json()
  open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
  model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

  sys.exit(0)
def data_sparse():
  """Bag-of-cuis data for sparse evaluation"""

  base = os.environ['DATA_ROOT']
  train_dir = os.path.join(base, cfg.get('data', 'train'))
  test_dir = os.path.join(base, cfg.get('data', 'test'))

  # load training data
  dataset_provider = dataset.DatasetProvider(train_dir)
  x_train, y_train = dataset_provider.load_sklearn()

  # load test data
  dataset_provider = dataset.DatasetProvider(test_dir)
  x_test, y_test = dataset_provider.load_sklearn()

  # turn xs into tfidf vectors
  vectorizer = TfidfVectorizer()
  x_train = vectorizer.fit_transform(x_train)
  x_test = vectorizer.transform(x_test)

  return x_train.toarray(), y_train, x_test.toarray(), y_test
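# A minimal usage sketch (not part of the original scripts): it assumes the
# arrays returned by data_sparse() (or data_dense()) are handed to a standard
# sklearn classifier; LinearSVC and the C value are illustrative choices only.
def eval_sparse_example():
  """Hypothetical downstream evaluation of the tf-idf features"""

  from sklearn.svm import LinearSVC
  from sklearn.metrics import f1_score

  x_train, y_train, x_test, y_test = data_sparse()

  classifier = LinearSVC(C=1.0)
  classifier.fit(x_train, y_train)
  predictions = classifier.predict(x_test)
  print('f1 (macro):', f1_score(y_test, predictions, average='macro'))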
def main():
  """Driver function"""

  base = os.environ['DATA_ROOT']
  dp = dataset.DatasetProvider(os.path.join(base, cfg.get('data', 'train')),
                               cfg.get('data', 'model_dir'),
                               cfg.get('args', 'n_x_cuis'),
                               cfg.get('args', 'n_y_cuis'),
                               cfg.getfloat('args', 'min_examples_per_targ'))
  x, y = dp.load()
  print('x shape:', x.shape)
  print('y shape:', y.shape)

  fixed_args = {
    'vocabulary_size': len(dp.tokenizer.word_index) + 1,
    'max_seq_len': x.shape[1],
    'n_targets': y.shape[1],
    'init_vectors': None,
    'loss': 'binary_crossentropy',
    'epochs': cfg.getint('search', 'max_epochs')}
  param_space = {
    'emb_dim': (512, 1024, 2048, 4096),
    'hidden': (1000, 3000, 5000, 10000),
    'activation': ('linear', 'tanh', 'relu'),
    'dropout': uniform(0, 0.75),
    'optimizer': ('RMSprop', 'Adam'),
    'log10lr': (-5, -4, -3, -2),
    'batch': (4, 8, 16, 32, 64)}

  config2score = rndsearch.run(make_model,
                               fixed_args,
                               param_space,
                               x,
                               y,
                               n=cfg.getint('search', 'n'),
                               verbose=1)

  # display configs sorted by f1
  print('\nconfigurations sorted by score:')
  sorted_by_value = sorted(config2score, key=config2score.get)
  for config in sorted_by_value:
    print('%s: %.3f' % (config, config2score[config]))

  best_config = dict(sorted_by_value[-1])
  print('best config:', best_config)
  print('best score:', config2score[sorted_by_value[-1]])
def fine_tune():
  """Fine tuning dense vectors"""

  # load target task train and test data
  base = os.environ['DATA_ROOT']
  train_dir = os.path.join(base, cfg.get('data', 'train'))
  test_dir = os.path.join(base, cfg.get('data', 'test'))
  maxlen = get_maxlen()

  dataset_provider = dataset.DatasetProvider(
    train_dir,
    cfg.get('data', 'alphabet_pickle'))
  x_train, y_train = dataset_provider.load_keras(maxlen=maxlen)
  x_train = pad_sequences(x_train, maxlen=maxlen)

  dataset_provider = dataset.DatasetProvider(
    test_dir,
    cfg.get('data', 'alphabet_pickle'))
  x_test, y_test = dataset_provider.load_keras(maxlen=maxlen)
  x_test = pad_sequences(x_test, maxlen=maxlen)

  # train and evaluate
  model = make_model()
  epochs = cfg.getint('data', 'epochs')
  model.fit(x_train, y_train, epochs=epochs, validation_split=0.0)

  predictions = model.predict_classes(x_test)
  probs = model.predict(x_test)

  p = precision_score(y_test, predictions, average='macro')
  r = recall_score(y_test, predictions, average='macro')
  f1 = f1_score(y_test, predictions, average='macro')
  print("precision: %.3f - recall: %.3f - f1: %.3f" % (p, r, f1))

  accuracy = accuracy_score(y_test, predictions)
  roc_auc = roc_auc_score(y_test, probs)
  print("auc: %.3f - accuracy: %.3f" % (roc_auc, accuracy))
def main(args):

  if len(args) < 1:
    sys.stderr.write("Error - one required argument: <data directory>\n")
    sys.exit(-1)

  working_dir = args[0]
  data_file = os.path.join(working_dir, 'training-data.liblinear')

  # learn alphabet from training data
  provider = dataset.DatasetProvider(data_file)
  # now load training examples and labels
  train_x, train_y = provider.load(data_file)

  # turn x and y into numpy array among other things
  maxlen = max([len(seq) for seq in train_x])
  classes = len(set(train_y))
  train_x = pad_sequences(train_x, maxlen=maxlen)
  train_y = to_categorical(np.array(train_y), classes)

  pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
  pickle.dump(provider.word2int,
              open(os.path.join(working_dir, 'word2int.p'), "wb"))
  pickle.dump(provider.label2int,
              open(os.path.join(working_dir, 'label2int.p'), "wb"))

  w2v = word2vec.Model('/home/dima/Data/Word2VecModels/mimic.txt')
  init_vectors = [w2v.select_vectors(provider.word2int)]

  model = get_model(len(provider.word2int), maxlen, init_vectors, classes)
  optimizer = RMSprop(lr=LEARN_RATE, rho=0.9, epsilon=1e-08)
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  model.fit(train_x,
            train_y,
            epochs=NUM_EPOCHS,
            batch_size=BATCH_SIZE,
            verbose=0,
            validation_split=0.0)

  json_string = model.to_json()
  open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
  model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

  sys.exit(0)
import os
import sys
import ConfigParser

from sklearn.model_selection import cross_val_score
import keras as k
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.models import Model
import dataset

if __name__ == "__main__":

  cfg = ConfigParser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']
  data_dir = os.path.join(base, cfg.get('data', 'path'))

  # load target task data
  dataset = dataset.DatasetProvider(data_dir,
                                    cfg.get('data', 'alphabet_pickle'))
  x, y = dataset.load()

  # pad to same maxlen as data in source model
  x = pad_sequences(x, maxlen=cfg.getint('data', 'maxlen'))
  print 'x shape (original):', x.shape

  # make vectors for target task
  model = load_model(cfg.get('data', 'model_file'))
  interm_layer_model = Model(inputs=model.input,
                             outputs=model.get_layer('HL').output)
  x = interm_layer_model.predict(x)
  print 'x shape (new):', x.shape

  # ready for svm train/test now
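  # A minimal sketch of the svm step hinted at above (not in the original
  # script): LinearSVC and the C value are illustrative assumptions;
  # cross_val_score is already imported at the top of this file.
  from sklearn.svm import LinearSVC
  classifier = LinearSVC(C=1.0)
  scores = cross_val_score(classifier, x, y, cv=5, scoring='f1_macro')
  print 'cv f1 (macro):', scores.mean()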
def main(args):

  if len(args) < 1:
    sys.stderr.write("Error - one required argument: <data directory>\n")
    sys.exit(-1)

  working_dir = args[0]
  data_file = os.path.join(working_dir, 'training-data.liblinear')

  # learn alphabet from training data
  provider = dataset.DatasetProvider(data_file)
  # now load training examples and labels
  train_x, train_y = provider.load(data_file)

  # turn x and y into numpy array among other things
  maxlen = max([len(seq) for seq in train_x])
  classes = len(set(train_y))
  train_x = pad_sequences(train_x, maxlen=maxlen)
  train_y = to_categorical(np.array(train_y), classes)

  pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
  pickle.dump(provider.word2int,
              open(os.path.join(working_dir, 'word2int.p'), "wb"))
  pickle.dump(provider.label2int,
              open(os.path.join(working_dir, 'label2int.p'), "wb"))

  print 'train_x shape:', train_x.shape
  print 'train_y shape:', train_y.shape

  branches = []  # models to be merged
  train_xs = []  # train x for each branch
  for filter_len in '2,3,4,5'.split(','):

    branch = Sequential()
    branch.add(Embedding(len(provider.word2int),
                         300,
                         input_length=maxlen,
                         weights=None))
    branch.add(Convolution1D(nb_filter=200,
                             filter_length=int(filter_len),
                             border_mode='valid',
                             activation='relu',
                             subsample_length=1))
    branch.add(MaxPooling1D(pool_length=2))
    branch.add(Flatten())

    branches.append(branch)
    train_xs.append(train_x)

  model = Sequential()
  model.add(Merge(branches, mode='concat'))
  model.add(Dropout(0.25))
  model.add(Dense(300))
  model.add(Activation('relu'))
  model.add(Dropout(0.25))
  model.add(Dense(classes))
  model.add(Activation('softmax'))

  optimizer = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08)
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  model.fit(train_xs,
            train_y,
            nb_epoch=4,
            batch_size=50,
            verbose=0,
            validation_split=0.1)

  json_string = model.to_json()
  open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
  model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

  sys.exit(0)
def main(args):

  if len(args) < 1:
    sys.stderr.write("Error - one required argument: <data directory>\n")
    sys.exit(-1)

  working_dir = args[0]

  # read in data file
  # print("Reading data...")
  # Y, X = ctk_io.read_liblinear(working_dir)
  #   ('data_testing/multitask_assertion/train_and_test')
  data_file = os.path.join(working_dir, 'training-data.liblinear')

  # learn alphabet from training and test data
  dataset1 = dataset.DatasetProvider([data_file])
  # now load training examples and labels
  train_x, train_y = dataset1.load(data_file)

  init_vectors = None  # used for pre-trained embeddings

  # turn x and y into numpy array among other things
  maxlen = max([len(seq) for seq in train_x])
  outcomes = set(train_y)
  classes = len(outcomes)
  train_x = pad_sequences(train_x, maxlen=maxlen)
  train_y = to_categorical(np.array(train_y), classes)

  pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
  pickle.dump(dataset1.alphabet,
              open(os.path.join(working_dir, 'alphabet.p'), "wb"))

  # test_x = pad_sequences(test_x, maxlen=maxlen)
  # test_y = to_categorical(np.array(test_y), classes)

  print 'train_x shape:', train_x.shape
  print 'train_y shape:', train_y.shape

  branches = []  # models to be merged
  train_xs = []  # train x for each branch
  # test_xs = []  # test x for each branch

  filtlens = "3,4,5"
  for filter_len in filtlens.split(','):

    branch = Sequential()
    branch.add(Embedding(len(dataset1.alphabet),
                         300,
                         input_length=maxlen,
                         weights=init_vectors))
    branch.add(Convolution1D(nb_filter=200,
                             filter_length=int(filter_len),
                             border_mode='valid',
                             activation='relu',
                             subsample_length=1))
    branch.add(MaxPooling1D(pool_length=2))
    branch.add(Flatten())

    branches.append(branch)
    train_xs.append(train_x)
    # test_xs.append(test_x)

  model = Sequential()
  model.add(Merge(branches, mode='concat'))
  model.add(Dense(250))      # cfg.getint('cnn', 'hidden')
  model.add(Dropout(0.25))   # cfg.getfloat('cnn', 'dropout')
  model.add(Activation('relu'))
  model.add(Dropout(0.25))   # cfg.getfloat('cnn', 'dropout')
  model.add(Dense(classes))
  model.add(Activation('softmax'))

  optimizer = RMSprop(lr=0.0001,  # cfg.getfloat('cnn', 'learnrt')
                      rho=0.9,
                      epsilon=1e-08)
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  model.fit(train_xs,
            train_y,
            nb_epoch=3,     # cfg.getint('cnn', 'epochs')
            batch_size=50,  # cfg.getint('cnn', 'batches')
            verbose=1,
            validation_split=0.1,
            class_weight=None)
  model.summary()

  json_string = model.to_json()
  open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
  model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

  sys.exit(0)
def main(args):

  if len(args) < 1:
    sys.stderr.write("Error - one required argument: <data directory>\n")
    sys.exit(-1)

  working_dir = args[0]

  # read in data file
  # print("Reading data...")
  # Y, X = ctk_io.read_liblinear(working_dir)
  #   ('data_testing/multitask_assertion/train_and_test')
  data_file = os.path.join(working_dir, 'training-data.liblinear')

  # learn alphabet from training and test data
  dataset1 = dataset.DatasetProvider([data_file])
  # now load training examples and labels
  train_x, train_y = dataset1.load(data_file)

  init_vectors = None  # used for pre-trained embeddings

  # turn x and y into numpy array among other things
  maxlen = max([len(seq) for seq in train_x])
  outcomes = set(train_y)
  classes = len(outcomes)
  train_x = pad_sequences(train_x, maxlen=maxlen)
  train_y = to_categorical(np.array(train_y), classes)

  pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
  pickle.dump(dataset1.alphabet,
              open(os.path.join(working_dir, 'alphabet.p'), "wb"))

  # test_x = pad_sequences(test_x, maxlen=maxlen)
  # test_y = to_categorical(np.array(test_y), classes)

  print 'train_x shape:', train_x.shape
  print 'train_y shape:', train_y.shape

  # branches = []   # models to be merged
  # train_xs = []   # train x for each branch
  # test_xs = []    # test x for each branch

  model = resnet(maxlen, dataset1.alphabet, classes)

  optimizer = RMSprop(lr=0.0001,  # cfg.getfloat('cnn', 'learnrt')
                      rho=0.9,
                      epsilon=1e-08)
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  model.fit(train_x,
            train_y,
            nb_epoch=3,     # cfg.getint('cnn', 'epochs')
            batch_size=50,  # cfg.getint('cnn', 'batches')
            verbose=1,
            validation_split=0.1,
            class_weight=None)
  model.summary()

  json_string = model.to_json()
  open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
  model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

  sys.exit(0)
def main():
  """Driver function"""

  base = os.environ['DATA_ROOT']
  dp = dataset.DatasetProvider(os.path.join(base, cfg.get('data', 'train')),
                               cfg.get('data', 'model_dir'),
                               cfg.get('args', 'n_x_cuis'),
                               cfg.get('args', 'n_y_cuis'),
                               cfg.getfloat('args', 'min_examples_per_targ'))
  x, y = dp.load()
  print('x shape:', x.shape)
  print('y shape:', y.shape)

  # are we training the best model?
  if cfg.getfloat('args', 'test_size') != 0:
    train_x, val_x, train_y, val_y = train_test_split(
      x, y, test_size=cfg.getfloat('args', 'test_size'))
    validation_data = (val_x, val_y)
  else:
    train_x, train_y = x, y
    validation_data = None

  # TODO: figure out what to do about negated cuis
  init_vectors = None
  if cfg.has_option('data', 'embed'):
    embed_file = os.path.join(base, cfg.get('data', 'embed'))
    w2v = word2vec.Model(embed_file, verbose=True)
    init_vectors = [w2v.select_vectors(dp.tokenizer.word_index)]

  model = get_model(
    len(dp.tokenizer.word_index) + 1,
    x.shape[1],
    y.shape[1],
    init_vectors)
  optim = getattr(optimizers, cfg.get('dan', 'optimizer'))
  model.compile(loss='binary_crossentropy',
                optimizer=optim(lr=10**cfg.getint('dan', 'log10lr')),
                metrics=['accuracy'])

  # save the model after every epoch
  callback = ModelCheckpoint(cfg.get('data', 'model_dir') + 'model.h5',
                             verbose=1,
                             save_best_only=True)

  model.fit(train_x,
            train_y,
            validation_data=validation_data,
            epochs=cfg.getint('dan', 'epochs'),
            batch_size=cfg.getint('dan', 'batch'),
            validation_split=0.0,
            callbacks=[callback])

  # are we training the best model?
  if cfg.getfloat('args', 'test_size') == 0:
    model.save(cfg.get('data', 'model_dir') + 'model.h5')
    exit()

  # probability for each class; (test size, num of classes)
  distribution = model.predict(val_x)

  # turn into an indicator matrix
  distribution[distribution < 0.5] = 0
  distribution[distribution >= 0.5] = 1

  f1 = f1_score(val_y, distribution, average='macro')
  p = precision_score(val_y, distribution, average='macro')
  r = recall_score(val_y, distribution, average='macro')
  print("\nmacro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))

  f1 = f1_score(val_y, distribution, average='micro')
  p = precision_score(val_y, distribution, average='micro')
  r = recall_score(val_y, distribution, average='micro')
  print("micro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
def main(args):

  if len(args) < 1:
    sys.stderr.write("Error - one required argument: <data directory>\n")
    sys.exit(-1)

  working_dir = args[0]
  data_file = os.path.join(working_dir, 'training-data.liblinear')

  # learn alphabet from training data
  provider = dataset.DatasetProvider(data_file)
  # now load training examples and labels
  train_x, train_y = provider.load(data_file)

  # turn x and y into numpy array among other things
  maxlen = max([len(seq) for seq in train_x])
  classes = len(set(train_y))
  train_x = pad_sequences(train_x, maxlen=maxlen)
  train_y = to_categorical(np.array(train_y), classes)

  # loading pre-trained embedding file
  embeddings_index = {}
  f = open(os.path.join(working_dir, 'mimic.txt'))
  values = f.readline().split()
  EMBEDDING_WORDNUM = int(values[0])
  EMBEDDING_DIM = int(values[1])
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
  f.close()
  print('load embeddings for %s=%s words.' %
        (len(embeddings_index), EMBEDDING_WORDNUM))

  # prepare embedding matrix
  nb_words = len(provider.word2int)
  embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
  for word, i in provider.word2int.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be all-zeros.
      embedding_matrix[i] = embedding_vector

  pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
  pickle.dump(provider.word2int,
              open(os.path.join(working_dir, 'word2int.p'), "wb"))
  pickle.dump(provider.label2int,
              open(os.path.join(working_dir, 'label2int.p'), "wb"))

  print 'train_x shape:', train_x.shape
  print 'train_y shape:', train_y.shape

  LSTM_DIM = 512
  DROPOUT = 0.5

  input = Input(shape=(maxlen, ), dtype='int32')
  embed = Embedding(nb_words,
                    EMBEDDING_DIM,
                    mask_zero=True,
                    input_length=maxlen,
                    weights=[embedding_matrix],
                    trainable=True)(input)
  lstm_fw = LSTM(LSTM_DIM,
                 dropout=DROPOUT,
                 recurrent_dropout=DROPOUT)(embed)
  lstm_bw = LSTM(LSTM_DIM,
                 dropout=DROPOUT,
                 recurrent_dropout=DROPOUT,
                 go_backwards=True)(embed)
  cat = concatenate([lstm_fw, lstm_bw])
  # drop = Dropout(DROPOUT)(cat)

  minV = -math.sqrt(6) / math.sqrt(LSTM_DIM * 2 + classes)
  maxV = math.sqrt(6) / math.sqrt(LSTM_DIM * 2 + classes)
  randUni = RandomUniform(minval=minV, maxval=maxV, seed=None)
  out = Dense(classes,
              activation='softmax',
              kernel_initializer=randUni,
              bias_initializer='zeros')(cat)
  model = Model(inputs=[input], outputs=[out])

  # optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08)
  optimizer = Adam(lr=0.001)
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  stopper = EarlyStopping(monitor='val_loss',
                          patience=10,
                          verbose=0,
                          mode='auto')
  model.fit(train_x,
            train_y,
            epochs=20,
            batch_size=256,
            verbose=2,
            validation_split=0.1,
            callbacks=[stopper])

  json_string = model.to_json()
  open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
  model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

  sys.exit(0)
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding

NFOLDS = 10
BATCH = 50
EPOCHS = 5
CLASSES = 2
EMBDIMS = 300
MAXLEN = 55
MAXFEATURES = 18000
FILTERS = 100
FILTLEN = 4

if __name__ == "__main__":

  dataset = dataset.DatasetProvider(MAXFEATURES)
  x, y = dataset.load_data()

  # TODO: what are we doing for index 0 (oov words)?
  path = '/Users/Dima/Loyola/Data/Word2Vec/Models/GoogleNews-vectors-negative300.txt'
  word2vec = word2vec_model.Model(path)
  init_vectors = word2vec.select_vectors(dataset.alphabet)

  # turn x and y into numpy array among other things
  x = sequence.pad_sequences(x, maxlen=MAXLEN)
  y = k.utils.np_utils.to_categorical(np.array(y), CLASSES)

  scores = []
  folds = sk.cross_validation.KFold(len(y), n_folds=NFOLDS, shuffle=True)

  # todo: look at train_indices and test_indices
  return model


if __name__ == "__main__":

  # settings file specified as command-line argument
  cfg = ConfigParser.ConfigParser()
  cfg.read(sys.argv[1])
  print_config(cfg)

  base = os.environ['DATA_ROOT']
  train_file = os.path.join(base, cfg.get('data', 'train'))
  test_file = os.path.join(base, cfg.get('data', 'test'))

  # learn alphabet from training examples
  dataset = dataset.DatasetProvider(train_file)
  # now load training examples and labels
  train_x, train_y = dataset.load(train_file)
  maxlen = max([len(seq) for seq in train_x])
  # now load test examples and labels
  test_x, test_y = dataset.load(test_file, maxlen=maxlen)

  init_vectors = None
  # TODO: what are we doing for index 0 (oov words)?
  # use pre-trained word embeddings?
  if cfg.has_option('data', 'embed'):
    embed_file = os.path.join(base, cfg.get('data', 'embed'))
    w2v = word2vec.Model(embed_file)
    init_vectors = [w2v.select_vectors(dataset.word2int)]

  # turn x and y into numpy array among other things
def main(args):

  if len(args) < 1:
    sys.stderr.write("Error - one required argument: <data directory>\n")
    sys.exit(-1)

  working_dir = args[0]
  data_file = os.path.join(working_dir, 'training-data.liblinear')

  # learn alphabet from training data
  provider = dataset.DatasetProvider(data_file)
  # now load training examples and labels
  train_x, train_y = provider.load(data_file)

  # turn x and y into numpy array among other things
  maxlen = max([len(seq) for seq in train_x])
  classes = len(set(train_y))
  train_x = pad_sequences(train_x, maxlen=maxlen)
  train_y = to_categorical(np.array(train_y), classes)

  # loading pre-trained embedding file
  embeddings_index = {}
  f = open(os.path.join(working_dir, 'mimic.txt'))
  values = f.readline().split()
  EMBEDDING_WORDNUM = int(values[0])
  EMBEDDING_DIM = int(values[1])
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
  f.close()
  print('load embeddings for %s=%s words.' %
        (len(embeddings_index), EMBEDDING_WORDNUM))

  # prepare embedding matrix
  nb_words = len(provider.word2int)
  embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
  for word, i in provider.word2int.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be all-zeros.
      embedding_matrix[i] = embedding_vector

  pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
  pickle.dump(provider.word2int,
              open(os.path.join(working_dir, 'word2int.p'), "wb"))
  pickle.dump(provider.label2int,
              open(os.path.join(working_dir, 'label2int.p'), "wb"))

  print 'train_x shape:', train_x.shape
  print 'train_y shape:', train_y.shape

  branches = []  # models to be merged
  train_xs = []  # train x for each branch
  inflows = []   # placeholder for each input
  for filter_len in '2,5'.split(','):

    branch = Input(shape=(maxlen, ))
    embed = Embedding(len(provider.word2int),
                      EMBEDDING_DIM,
                      weights=[embedding_matrix],
                      trainable=True)(branch)
    conv = Conv1D(filters=200,
                  kernel_size=int(filter_len),
                  padding='valid',
                  activation='relu',
                  strides=1)(embed)
    pool = MaxPooling1D(pool_size=2)(conv)
    flat = Flatten()(pool)

    branches.append(flat)
    train_xs.append(train_x)
    inflows.append(branch)

  concat = concatenate(branches)
  drop1 = Dropout(0.25)(concat)
  dense = Dense(200, activation='relu')(drop1)
  drop2 = Dropout(0.25)(dense)
  out = Dense(classes, activation='softmax')(drop2)
  model = Model(inputs=inflows, outputs=out)

  # optimizer = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08)
  optimizer = Adam(lr=0.001)
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  stopper = EarlyStopping(monitor='val_loss',
                          patience=10,
                          verbose=0,
                          mode='auto')
  model.fit(train_xs,
            train_y,
            epochs=20,
            batch_size=128,
            verbose=1,
            validation_split=0.1,
            callbacks=[stopper])

  json_string = model.to_json()
  open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
  model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

  sys.exit(0)
def run(train_file, test_file, batch, epochs, embdims,
        filters, filtlen, hidden, dropout, learnrt):
  """Train/test with given parameters. Return F1."""

  np.random.seed(1337)

  print 'train:', train_file
  print 'test:', test_file
  print 'batch:', batch
  print 'epochs:', epochs
  print 'embdims:', embdims
  print 'filters:', filters
  print 'filtlen:', filtlen
  print 'hidden:', hidden
  print 'dropout:', dropout
  print 'learnrt:', learnrt

  # learn alphabet from training examples
  datset = dataset.DatasetProvider(train_file)
  # now load training examples and labels
  train_x, train_y = datset.load(train_file)
  maxlen = max([len(seq) for seq in train_x])
  # now load test examples and labels
  test_x, test_y = datset.load(test_file, maxlen=maxlen)

  # turn x and y into numpy array among other things
  classes = len(set(train_y))
  train_x = pad_sequences(train_x, maxlen=maxlen)
  train_y = to_categorical(np.array(train_y), classes)
  test_x = pad_sequences(test_x, maxlen=maxlen)
  test_y = to_categorical(np.array(test_y), classes)

  branches = []  # models to be merged
  train_xs = []  # train x for each branch
  test_xs = []   # test x for each branch
  for filter_len in filtlen.split(','):

    branch = Sequential()
    branch.add(Embedding(len(datset.word2int),
                         embdims,
                         trainable=False,
                         input_length=maxlen))
    branch.add(Convolution1D(nb_filter=filters,
                             filter_length=int(filter_len),
                             border_mode='valid',
                             activation='relu',
                             subsample_length=1))
    branch.add(MaxPooling1D(pool_length=2))
    branch.add(Flatten())

    branches.append(branch)
    train_xs.append(train_x)
    test_xs.append(test_x)

  model = Sequential()
  model.add(Merge(branches, mode='concat'))
  model.add(Dropout(dropout))
  model.add(Dense(hidden))
  model.add(Activation('relu'))
  model.add(Dropout(dropout))
  model.add(Dense(classes))
  model.add(Activation('softmax'))

  optimizer = RMSprop(lr=learnrt, rho=0.9, epsilon=1e-08)
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  model.fit(train_xs,
            train_y,
            nb_epoch=epochs,
            batch_size=batch,
            verbose=0,
            validation_split=0.0,
            class_weight=None)

  # probability for each class; (test size, num of classes)
  distribution = model.predict(test_xs, batch_size=batch)
  # class predictions; (test size,)
  predictions = np.argmax(distribution, axis=1)
  # gold labels; (test size,)
  gold = np.argmax(test_y, axis=1)

  # f1 scores
  label_f1 = f1_score(gold, predictions, average=None)

  print
  for label, idx in datset.label2int.items():
    print 'f1(%s)=%f' % (label, label_f1[idx])

  if 'contains' in datset.label2int:
    idxs = [datset.label2int['contains'], datset.label2int['contains-1']]
    contains_f1 = f1_score(gold, predictions, labels=idxs, average='micro')
    print '\nf1(contains average) =', contains_f1
  else:
    idxs = datset.label2int.values()
    average_f1 = f1_score(gold, predictions, labels=idxs, average='micro')
    print 'f1(all) =', average_f1

  print '******************************************'
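# A hypothetical example call (not part of the original script): the file
# names are placeholders and the hyperparameter values simply mirror typical
# settings seen elsewhere in these scripts; adjust both to the actual setup.
if __name__ == "__main__":
  run('train.liblinear', 'test.liblinear',
      batch=50, epochs=5, embdims=300, filters=100,
      filtlen='2,3,4', hidden=300, dropout=0.25, learnrt=0.0001)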
def main():
  """Driver function"""

  base = os.environ['DATA_ROOT']
  dp = dataset.DatasetProvider(
    os.path.join(base, cfg.get('data', 'cuis')),
    os.path.join(base, cfg.get('data', 'codes')),
    cfg.get('args', 'max_cuis'),
    cfg.get('args', 'max_codes'))
  x, y = dp.load()
  print('x shape:', x.shape)
  print('y shape:', y.shape)

  # are we training the best model?
  if cfg.getfloat('args', 'test_size') != 0:
    train_x, val_x, train_y, val_y = train_test_split(
      x, y, test_size=cfg.getfloat('args', 'test_size'))
    validation_data = (val_x, val_y)
  else:
    train_x, train_y = x, y
    validation_data = None

  # need to add one to account for the index 0 which is not used
  model = get_model(x.shape[1], y.shape[1])
  optim = getattr(optimizers, cfg.get('bow', 'optimizer'))
  model.compile(loss='binary_crossentropy',
                optimizer=optim(lr=10**cfg.getint('bow', 'log10lr')),
                metrics=['accuracy'])

  # save the model after every epoch
  callback = ModelCheckpoint(
    cfg.get('data', 'model_dir') + 'model.h5',
    verbose=1,
    save_best_only=True)

  model.fit(train_x,
            train_y,
            validation_data=validation_data,
            epochs=cfg.getint('bow', 'epochs'),
            batch_size=cfg.getint('bow', 'batch'),
            validation_split=0.0,
            callbacks=[callback])

  # are we training the best model?
  if cfg.getfloat('args', 'test_size') == 0:
    model.save(cfg.get('data', 'model_dir') + 'model.h5')
    exit()

  # probability for each class; (test size, num of classes)
  distribution = model.predict(val_x)

  # turn into an indicator matrix
  distribution[distribution < 0.5] = 0
  distribution[distribution >= 0.5] = 1

  f1 = f1_score(val_y, distribution, average='macro')
  p = precision_score(val_y, distribution, average='macro')
  r = recall_score(val_y, distribution, average='macro')
  print("\nmacro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))

  f1 = f1_score(val_y, distribution, average='micro')
  p = precision_score(val_y, distribution, average='micro')
  r = recall_score(val_y, distribution, average='micro')
  print("micro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
  model.add(Dense(classes))
  model.add(Activation('softmax'))

  return model


if __name__ == "__main__":

  cfg = ConfigParser.ConfigParser()
  cfg.read(sys.argv[1])
  print_config(cfg)

  base = os.environ['DATA_ROOT']
  data_dir = os.path.join(base, cfg.get('data', 'path'))

  dataset = dataset.DatasetProvider(
    data_dir,
    cfg.getint('args', 'min_token_freq'))
  x, y = dataset.load()

  classes = len(dataset.label2int)
  maxlen = max([len(seq) for seq in x])
  x = pad_sequences(x, maxlen=maxlen)
  y = to_categorical(y, classes)

  print 'x shape:', x.shape
  print 'y shape:', y.shape
  print 'number of features:', len(dataset.token2int)

  f1_scores = []
  kf = KFold(n_splits=5, shuffle=True, random_state=100)
  for train_indices, test_indices in kf.split(x):
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers import GlobalAveragePooling1D
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
import dataset

if __name__ == "__main__":

  cfg = ConfigParser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']
  train_dir = os.path.join(base, cfg.get('data', 'train'))
  code_file = os.path.join(base, cfg.get('data', 'codes'))

  dataset = dataset.DatasetProvider(train_dir, code_file)
  x, y = dataset.load()
  train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.20)
  maxlen = max([len(seq) for seq in train_x])

  # turn x into numpy array among other things
  classes = len(dataset.code2int)
  train_x = pad_sequences(train_x, maxlen=maxlen)
  test_x = pad_sequences(test_x, maxlen=maxlen)
  train_y = np.array(train_y)
  test_y = np.array(test_y)

  print 'train_x shape:', train_x.shape
  print 'train_y shape:', train_y.shape
  print 'test_x shape:', test_x.shape
  print 'test_y shape:', test_y.shape
  print 'unique features:', len(dataset.token2int)
if __name__ == "__main__":

  # fyi this is a global variable now
  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])

  base = os.environ['DATA_ROOT']
  train_dir = os.path.join(base, cfg.get('data', 'train'))
  code_file = os.path.join(base, cfg.get('data', 'codes'))

  provider = dataset.DatasetProvider(train_dir,
                                     code_file,
                                     cfg.getint('args', 'min_token_freq'),
                                     cfg.getint('args', 'max_tokens_in_file'),
                                     cfg.getint('args', 'min_examples_per_code'),
                                     use_cuis=False)
  x, y = provider.load(tokens_as_set=False)

  maxlen = max([len(seq) for seq in x])
  x = pad_sequences(x, maxlen=maxlen)
  y = np.array(y)

  print('x shape:', x.shape)
  print('y shape:', y.shape)
  print('max seq len:', maxlen)
  print('vocab size:', x.max() + 1)
  print('number of features:', len(provider.token2int))
  print('number of labels:', len(provider.code2int))
import dataset
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU

NFOLDS = 10
BATCH = 50
EPOCHS = 5
EMBDIMS = 300

if __name__ == "__main__":

  dataset = dataset.DatasetProvider()
  x, y = dataset.load()

  print 'x shape:', x.shape
  print 'y shape:', y.shape

  scores = []
  folds = sk.cross_validation.KFold(len(y), n_folds=NFOLDS, shuffle=True)

  for fold_num, (train_indices, test_indices) in enumerate(folds):

    train_x = x[train_indices]
    train_y = y[train_indices]
    test_x = x[test_indices]
    test_y = y[test_indices]

    model = k.models.Sequential()
    model.add(LSTM(128, input_length=205845, input_dim=300))
from keras.models import Sequential
from keras.layers import Merge, LSTM
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
import dataset
import ConfigParser

if __name__ == "__main__":

  cfg = ConfigParser.ConfigParser()
  cfg.read('settings.ini')

  # learn alphabet from training data
  dataset = dataset.DatasetProvider(
    [cfg.get('data', 'train'), cfg.get('data', 'test')])
  # now load training examples and labels
  train_x, train_y = dataset.load(cfg.get('data', 'train'))
  # now load test examples and labels
  test_x, test_y = dataset.load(cfg.get('data', 'test'))

  # turn x and y into numpy array among other things
  maxlen = max([len(seq) for seq in train_x + test_x])
  classes = len(set(train_y))
  train_x = sequence.pad_sequences(train_x, maxlen=maxlen)
  train_y = k.utils.np_utils.to_categorical(np.array(train_y), classes)
  test_x = sequence.pad_sequences(test_x, maxlen=maxlen)
  test_y = k.utils.np_utils.to_categorical(np.array(test_y), classes)

  print 'train_x shape:', train_x.shape
# settings file specified as command-line argument
cfg = ConfigParser.ConfigParser()
cfg.read(sys.argv[1])

print 'train:', cfg.get('data', 'train')
print 'test:', cfg.get('data', 'test')
print 'batch:', cfg.get('cnn', 'batch')
print 'epochs:', cfg.get('cnn', 'epochs')
print 'embdims:', cfg.get('cnn', 'embdims')
print 'filters:', cfg.get('cnn', 'filters')
print 'filtlen:', cfg.get('cnn', 'filtlen')
print 'hidden:', cfg.get('cnn', 'hidden')
print 'dropout:', cfg.get('cnn', 'dropout')
print 'learnrt:', cfg.get('cnn', 'learnrt')

# learn alphabets from training examples
dataset = dataset.DatasetProvider(cfg.get('data', 'train'))
# now load training examples and labels
train_x1, train_x2, train_y = dataset.load(cfg.get('data', 'train'))
maxlen = max([len(seq) for seq in train_x1])
# now load test examples and labels
test_x1, test_x2, test_y = dataset.load(cfg.get('data', 'test'),
                                        maxlen=maxlen)

init_vectors = None
# TODO: what are we doing for index 0 (oov words)?
# use pre-trained word embeddings?
if cfg.has_option('data', 'embed'):
  print 'embeddings:', cfg.get('data', 'embed')
  word2vec = word2vec_model.Model(cfg.get('data', 'embed'))
  init_vectors = [word2vec.select_vectors(dataset.word2int)]