def classify():
    """Train and evaluate a word-level CNN classifier for INJ_BODY_PART_CD.

    Pulls MSHA narratives via get_data(), label-encodes the target field,
    vectorizes the text with the pickled "simple model" embeddings, trains a
    single-convolution Keras model, and scores it on the held-out test set.

    Returns:
        (model, X_test): the trained Sequential model and the 4-D vectorized
        test inputs, so callers can run further predictions.
    """
    code_type = 'INJ_BODY_PART_CD'
    raw_train, raw_test = get_data(n_train=10000, n_test=1000)
    raw_train_labels = raw_train[code_type]

    # Fit the encoder on the union of train and test labels so transform()
    # never encounters an unseen class at evaluation time.
    labeler = LabelEncoder()
    labels = set(raw_train[code_type].tolist() + raw_test[code_type].tolist())
    labeler.fit(list(labels))
    y_train = labeler.transform(raw_train_labels)
    print(y_train.shape)
    nb_classes = len(labels)  # labels is already a set; no need to re-dedupe
    print('nb_classes = %s' % nb_classes)
    Y_train = np_utils.to_categorical(y_train, nb_classes)

    simple_model = get_simple_model()
    X_train = vectorize(raw_train, simple_model)
    print(X_train.shape)
    n_features = int(X_train.shape[1])
    X_train = make_4d(X_train)
    print(X_train.shape)
    print(type(X_train))

    BATCH_SIZE = 16
    INPUT_SIZE = n_features
    FIELD_SIZE = 5 * 300  # filter spans 5 words of 300-dim embeddings
    STRIDE = 300          # slide one word (one embedding width) at a time
    N_FILTERS = 100

    model = Sequential()
    model.add(Convolution2D(nb_filter=N_FILTERS, stack_size=1,
                            nb_row=FIELD_SIZE, nb_col=1,
                            subsample=(STRIDE, 1)))
    model.add(Activation('relu'))
    # Max-pool across every filter position so each filter yields a single
    # activation. // keeps the pool size integral (true division would make
    # it a float under Python 3; on ints // and / agree under Python 2).
    model.add(MaxPooling2D(poolsize=(((INPUT_SIZE - FIELD_SIZE) // STRIDE) + 1, 1)))
    model.add(Dropout(0.5))
    model.add(Flatten())
    #model.add(BatchNormalization(N_FILTERS))
    model.add(Dense(N_FILTERS, nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adadelta')
    model.fit(X_train, Y_train, nb_epoch=10, batch_size=BATCH_SIZE,
              verbose=1, show_accuracy=True, validation_split=0.1)

    # Evaluate on the held-out test split with the same encoder/vectorizer.
    raw_test_labels = raw_test[code_type]
    X_test = vectorize(raw_test, simple_model)
    X_test = make_4d(X_test)
    y_test = labeler.transform(raw_test_labels)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    score = model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE,
                           show_accuracy=True)
    print(score)
    return model, X_test
# NOTE(review): collapsed paste. This line fuses two things:
#   (a) the tail of a batch-shuffling generator whose `def` header lies
#       outside this view — when the next batch would run past the data it
#       saves the remainder, reshuffles X/Y with one permutation, stacks the
#       remainder on top, and resets the cursor. The `yield x_out, y_out`
#       references names not defined in the visible code (the slices above
#       are named remaining_X/remaining_Y) — presumably x_out/y_out are built
#       earlier in the loop; TODO confirm against the full file before editing.
#   (b) top-level script setup: loads 47500/2500/10000 train/valid/test rows,
#       fits a LabelEncoder on the union of train+test ACTIVITY_CD labels
#       (validation labels are assumed covered by that union — verify), and
#       one-hot encodes the train and validation targets.
if (start + batch_size) > n_samples: print 'reshuffling, %s + %s > %s' % (start, batch_size, n_samples) remaining_X = X[start: start + batch_size] remaining_Y = Y[start: start + batch_size] random_index = np.random.permutation(n_samples) X = np.vstack((remaining_X, X[random_index, :])) Y = np.vstack((remaining_Y, Y[random_index, :])) start = 0 n_samples = len(X) yield x_out, y_out max_len = 100 gen_batch_size = 20 code_type = 'ACTIVITY_CD' raw_train, raw_valid, raw_test = get_data(n_train=47500, n_valid=2500, n_test=10000) raw_train_labels = raw_train[code_type] raw_valid_labels = raw_valid[code_type] raw_test_labels = raw_test[code_type] labeler = LabelEncoder() labels = set(raw_train[code_type].tolist() + raw_test[code_type].tolist()) labeler.fit(list(labels)) nb_classes = len(set(labels)) print('nb_classes = %s' % nb_classes) y_train = labeler.transform(raw_train_labels) Y_train = np_utils.to_categorical(y_train, nb_classes) y_valid = labeler.transform(raw_valid_labels) Y_valid = np_utils.to_categorical(y_valid, nb_classes)
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2

from embed_preprocessing import get_initial_embeddings
from msha_extractor import get_data

# Model/training hyperparameters.
n_filters = 400
max_len = 100         # truncate/pad narratives to this many tokens
embedding_size = 300
batch_size = 128

code_type = 'ACTIVITY_CD'
raw_train, raw_test = get_data(n_train=50000, n_test=10000)
raw_train_labels = raw_train[code_type]

# Fit the encoder on the union of train and test labels so transform()
# never encounters an unseen class.
labeler = LabelEncoder()
labels = set(raw_train[code_type].tolist() + raw_test[code_type].tolist())
labeler.fit(list(labels))
nb_classes = len(labels)  # labels is already a set; no need to re-dedupe
print('nb_classes = %s' % nb_classes)

y_train = labeler.transform(raw_train_labels)
# %-formatting keeps the print compatible with both Python 2 and 3.
print('y_train shape is: %s' % (y_train.shape,))
print('Vectorizing labels')
Y_train = np_utils.to_categorical(y_train, nb_classes)
# Original message said "y_train" here but printed Y_train.shape; fixed.
print('Y_train shape is: %s' % (Y_train.shape,))

raw_test_labels = raw_test[code_type]
y_test = labeler.transform(raw_test_labels)
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2

from embed_preprocessing import get_initial_embeddings
from msha_extractor import get_data

# Model/training hyperparameters.
n_filters = 400
max_len = 100         # truncate/pad narratives to this many tokens
embedding_size = 300
batch_size = 128

code_type = 'ACTIVITY_CD'
raw_train, raw_test = get_data(n_train=50000, n_test=10000)
raw_train_labels = raw_train[code_type]

# Fit the encoder on the union of train and test labels so transform()
# never encounters an unseen class.
labeler = LabelEncoder()
labels = set(raw_train[code_type].tolist() + raw_test[code_type].tolist())
labeler.fit(list(labels))
nb_classes = len(labels)  # labels is already a set; no need to re-dedupe
print('nb_classes = %s' % nb_classes)

y_train = labeler.transform(raw_train_labels)
# %-formatting keeps the print compatible with both Python 2 and 3.
print('y_train shape is: %s' % (y_train.shape,))
print('Vectorizing labels')
Y_train = np_utils.to_categorical(y_train, nb_classes)
# Original message said "y_train" here but printed Y_train.shape; fixed.
print('Y_train shape is: %s' % (Y_train.shape,))

raw_test_labels = raw_test[code_type]
y_test = labeler.transform(raw_test_labels)
# NOTE(review): collapsed paste. This line fuses two things:
#   (a) the tail of a batch-shuffling generator whose `def` header lies
#       outside this view — it saves the leftover batch, reshuffles X/Y with
#       one permutation, stacks the leftover back on, resets the cursor, and
#       yields a {'input': ..., 'output': ...} dict (the old Keras Graph-model
#       feed format). X_out/Y_out are not defined in the visible code —
#       presumably built earlier in the loop; TODO confirm in the full file.
#   (b) top-level script setup: loads 47500/2500/10000 train/valid/test rows,
#       fits a LabelEncoder on the union of train+test ACTIVITY_CD labels
#       (validation labels are assumed covered by that union — verify), and
#       encodes train targets one-hot; the matching Y_valid line appears to be
#       cut off at the end of this chunk.
remaining_Y = Y[start:start + batch_size] random_index = np.random.permutation(n_samples) X = np.vstack((remaining_X, X[random_index, :])) Y = np.vstack((remaining_Y, Y[random_index, :])) start = 0 n_samples = len(X) yield {'input': X_out, 'output': Y_out} max_len = 100 gen_batch_size = 10 checkpoint_dir = r'C:\Users\ameasure\Desktop\Programming Projects\cnn\checkpoints' code_type = 'ACTIVITY_CD' raw_train, raw_valid, raw_test = get_data(n_train=47500, n_valid=2500, n_test=10000) raw_train_labels = raw_train[code_type] raw_valid_labels = raw_valid[code_type] raw_test_labels = raw_test[code_type] labeler = LabelEncoder() labels = set(raw_train[code_type].tolist() + raw_test[code_type].tolist()) labeler.fit(list(labels)) nb_classes = len(set(labels)) print('nb_classes = %s' % nb_classes) y_train = labeler.transform(raw_train_labels) Y_train = np_utils.to_categorical(y_train, nb_classes) y_valid = labeler.transform(raw_valid_labels)
def make_and_save_simple_model():
    """Build the "simple" embedding model from the full corpus and pickle it.

    Side effects: writes the pickled model to 'simple_model.pi' in the
    current working directory.
    """
    # n_train is effectively "all available rows"; no test split is needed.
    train, test = get_data(n_train=100000000, n_test=0)
    vocabulary = get_vocabulary(train)
    model = get_model()
    simple_model = make_simple_model(model=model, vocabulary=vocabulary)
    # Context manager guarantees the handle is closed even if dump() raises
    # (the original leaked the file object returned by open()).
    with open('simple_model.pi', 'wb') as f:
        pickle.dump(simple_model, f)
import nltk

from msha_extractor import get_data
from attention import Attention

# Sequence-length caps for the hierarchical char -> word -> sentence model.
max_char_seq = 15
max_word_seq = 15  # cut texts after this number of words (among top max_features most common words)
max_sent_seq = 4

# Hidden sizes for each LSTM level, plus embedding/batch settings.
word_lstm_hidden = 128
sent_lstm_hidden = 128
doc_lstm_hidden = 128
embedding_dim = 16
batch_size = 80

print('Loading data...')
raw_train, raw_valid, _ = get_data(n_train=50000, n_valid=1000, n_test=10000)

#def remove_new_labels(df, label_field, new_labels):
#    print('df has %s rows' % len(df))
#    df = df[~df[label_field].isin(new_labels)]
#    print('reduced to %s rows' % len(df))
#    return df
#
#new_labels = ['005', '011', '045', '068']
#raw_train = remove_new_labels(raw_train, 'ACTIVITY_CD', new_labels)
#raw_valid = remove_new_labels(raw_valid, 'ACTIVITY_CD', new_labels)

# Character-level tokenizer fit on the training narratives; vocabulary size
# comes from the fitted index.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(raw_train['NARRATIVE'])
max_features = len(tokenizer.word_index)
print('max_features', max_features)