Example #1
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers.core import Activation, Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils


# get_data, get_simple_model, vectorize, and make_4d are project helpers
# defined elsewhere; the imports above are restored for the pre-1.0 Keras
# API this example targets.
def classify():
    code_type = 'INJ_BODY_PART_CD'
    raw_train, raw_test = get_data(n_train=10000, n_test=1000)
    raw_train_labels = raw_train[code_type]

    labeler = LabelEncoder()
    labels = set(raw_train[code_type].tolist() + raw_test[code_type].tolist())
    labeler.fit(list(labels))
    y_train = labeler.transform(raw_train_labels)
    print(y_train.shape)
    nb_classes = len(set(labels))
    print('nb_classes = %s' % nb_classes)
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    
    simple_model = get_simple_model()
    X_train = vectorize(raw_train, simple_model)
    
    print(X_train.shape)
    n_features = int(X_train.shape[1])
    X_train = make_4d(X_train)
    print(X_train.shape)
    print(type(X_train))
    
    BATCH_SIZE = 16
    INPUT_SIZE = n_features
    FIELD_SIZE = 5 * 300
    STRIDE = 300
    N_FILTERS = 100

    model = Sequential()
    model.add(Convolution2D(nb_filter=N_FILTERS, stack_size=1, nb_row=FIELD_SIZE, 
                            nb_col=1, subsample=(STRIDE, 1)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(poolsize=(((INPUT_SIZE - FIELD_SIZE)/STRIDE) + 1, 1)))
    model.add(Dropout(0.5))
    model.add(Flatten())
    #model.add(BatchNormalization(N_FILTERS))
    model.add(Dense(N_FILTERS, nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adadelta')
    model.fit(X_train, Y_train, nb_epoch=10, batch_size=BATCH_SIZE, verbose=1,
              show_accuracy=True, validation_split=0.1)

    raw_test_labels = raw_test[code_type]
    X_test = vectorize(raw_test, simple_model)
    X_test = make_4d(X_test)
    y_test = labeler.transform(raw_test_labels)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    score = model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE, show_accuracy=True)
    print(score)
    return model, X_test
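Example #1 targets the pre-1.0 Keras API (stack_size, poolsize, show_accuracy, Dense(input_dim, output_dim)). As a reference point only, a minimal sketch of the same convolution-over-flattened-embeddings architecture against the later Conv2D API might look like the following; INPUT_SIZE and nb_classes are placeholder assumptions, not values from the example.

from keras.models import Sequential
from keras.layers import Conv2D, Activation, MaxPooling2D, Dropout, Flatten, Dense

INPUT_SIZE = 30000    # assumed: length of one flattened document embedding
FIELD_SIZE = 5 * 300  # five 300-dim word vectors per filter window
STRIDE = 300          # advance one word vector per step
N_FILTERS = 100
nb_classes = 50       # assumed

model = Sequential()
# A filter column slides down the flattened embedding vector, one word at a time.
model.add(Conv2D(N_FILTERS, kernel_size=(FIELD_SIZE, 1), strides=(STRIDE, 1),
                 input_shape=(INPUT_SIZE, 1, 1)))
model.add(Activation('relu'))
# Max-pool over every window position, leaving one value per filter.
model.add(MaxPooling2D(pool_size=((INPUT_SIZE - FIELD_SIZE) // STRIDE + 1, 1)))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adadelta',
              metrics=['accuracy'])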
Example #2

import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

from msha_extractor import get_data


def batch_generator(X, Y, batch_size):
    # The snippet begins mid-function in the source: the signature, the loop,
    # the batch slicing, and the imports above are restored from context.
    n_samples = len(X)
    start = 0
    while True:
        if (start + batch_size) > n_samples:
            print('reshuffling, %s + %s > %s' % (start, batch_size, n_samples))
            remaining_X = X[start: start + batch_size]
            remaining_Y = Y[start: start + batch_size]
            random_index = np.random.permutation(n_samples)
            X = np.vstack((remaining_X, X[random_index, :]))
            Y = np.vstack((remaining_Y, Y[random_index, :]))
            start = 0
            n_samples = len(X)
        x_out = X[start: start + batch_size]
        y_out = Y[start: start + batch_size]
        start += batch_size
        yield x_out, y_out

max_len = 100
gen_batch_size = 20

code_type = 'ACTIVITY_CD'
raw_train, raw_valid, raw_test = get_data(n_train=47500, n_valid=2500, n_test=10000)
raw_train_labels = raw_train[code_type]
raw_valid_labels = raw_valid[code_type]
raw_test_labels = raw_test[code_type]

labeler = LabelEncoder()
labels = set(raw_train[code_type].tolist() + raw_test[code_type].tolist())
labeler.fit(list(labels))
nb_classes = len(set(labels))
print('nb_classes = %s' % nb_classes)

y_train = labeler.transform(raw_train_labels)
Y_train = np_utils.to_categorical(y_train, nb_classes)

y_valid = labeler.transform(raw_valid_labels)
Y_valid = np_utils.to_categorical(y_valid, nb_classes)
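The rest of this example is cut off in the source. Assuming a compiled model like the one in Example #1, the generator above would typically be consumed with the old-style fit_generator, which took a per-epoch sample count rather than a step count; the model, X_train, X_valid, and the epoch count below are assumptions for illustration.

train_gen = batch_generator(X_train, Y_train, batch_size=gen_batch_size)
model.fit_generator(train_gen,
                    samples_per_epoch=len(X_train),
                    nb_epoch=10,
                    validation_data=(X_valid, Y_valid))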
Example #4
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2

from embed_preprocessing import get_initial_embeddings
from msha_extractor import get_data


n_filters = 400
max_len = 100
embedding_size = 300
batch_size = 128

code_type = 'ACTIVITY_CD'
raw_train, raw_test = get_data(n_train=50000, n_test=10000)
raw_train_labels = raw_train[code_type]

labeler = LabelEncoder()
labels = set(raw_train[code_type].tolist() + raw_test[code_type].tolist())
labeler.fit(list(labels))
nb_classes = len(set(labels))
print('nb_classes = %s' % nb_classes)
y_train = labeler.transform(raw_train_labels)
print('y_train shape is: %s' % (y_train.shape,))
print('Vectorizing labels')
Y_train = np_utils.to_categorical(y_train, nb_classes)
print('Y_train shape is: %s' % (Y_train.shape,))

raw_test_labels = raw_test[code_type]
y_test = labeler.transform(raw_test_labels)
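Example #4 imports Tokenizer and pad_sequences but is truncated before they are used. A plausible continuation, sketched here as an assumption rather than the author's code, turns each narrative into a fixed-length sequence of word indices (the 'NARRATIVE' field name is taken from Example #7):

tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_train['NARRATIVE'])
X_train = pad_sequences(tokenizer.texts_to_sequences(raw_train['NARRATIVE']),
                        maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(raw_test['NARRATIVE']),
                       maxlen=max_len)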
Example #5

import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

from msha_extractor import get_data


def batch_generator(X, Y, batch_size):
    # Truncated in the source: everything above remaining_Y is restored from
    # the matching generator in Example #2.
    n_samples = len(X)
    start = 0
    while True:
        if (start + batch_size) > n_samples:
            remaining_X = X[start:start + batch_size]
            remaining_Y = Y[start:start + batch_size]
            random_index = np.random.permutation(n_samples)
            X = np.vstack((remaining_X, X[random_index, :]))
            Y = np.vstack((remaining_Y, Y[random_index, :]))
            start = 0
            n_samples = len(X)
        X_out = X[start:start + batch_size]
        Y_out = Y[start:start + batch_size]
        start += batch_size
        # The dict output matches the old Keras Graph-model generator interface.
        yield {'input': X_out, 'output': Y_out}


max_len = 100
gen_batch_size = 10
checkpoint_dir = r'C:\Users\ameasure\Desktop\Programming Projects\cnn\checkpoints'

code_type = 'ACTIVITY_CD'
raw_train, raw_valid, raw_test = get_data(n_train=47500,
                                          n_valid=2500,
                                          n_test=10000)
raw_train_labels = raw_train[code_type]
raw_valid_labels = raw_valid[code_type]
raw_test_labels = raw_test[code_type]

labeler = LabelEncoder()
labels = set(raw_train[code_type].tolist() + raw_test[code_type].tolist())
labeler.fit(list(labels))
nb_classes = len(set(labels))
print('nb_classes = %s' % nb_classes)

y_train = labeler.transform(raw_train_labels)
Y_train = np_utils.to_categorical(y_train, nb_classes)

y_valid = labeler.transform(raw_valid_labels)
Y_valid = np_utils.to_categorical(y_valid, nb_classes)  # restored from the parallel lines in Example #2
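checkpoint_dir is defined but never used in the surviving part of this example. One plausible use, assuming the original training loop saved weights with Keras's ModelCheckpoint callback (the filename and settings are guesses):

import os
from keras.callbacks import ModelCheckpoint

checkpointer = ModelCheckpoint(filepath=os.path.join(checkpoint_dir, 'weights.hdf5'),
                               save_best_only=True)
# Passed to the (missing) training call via callbacks=[checkpointer].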
Example #6
import pickle


# get_data, get_vocabulary, get_model, and make_simple_model are project
# helpers defined elsewhere (get_data appears in the other examples).
def make_and_save_simple_model():
    train, test = get_data(n_train=100000000, n_test=0)
    vocabulary = get_vocabulary(train)
    model = get_model()
    simple_model = make_simple_model(model=model, vocabulary=vocabulary)
    with open('simple_model.pi', 'wb') as f:
        pickle.dump(simple_model, f)
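For completeness, a counterpart to the dump step above: reloading the pickled simple model. This loader is not part of the source; only the 'simple_model.pi' filename comes from the example.

import pickle

def load_simple_model(path='simple_model.pi'):
    # Reload the object saved by make_and_save_simple_model().
    with open(path, 'rb') as f:
        return pickle.load(f)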
Example #7

import nltk
from keras.preprocessing.text import Tokenizer

from msha_extractor import get_data
from attention import Attention

max_char_seq = 15
max_word_seq = 15  # cut texts after this number of words (among top max_features most common words)
max_sent_seq = 4
word_lstm_hidden = 128
sent_lstm_hidden = 128
doc_lstm_hidden = 128
embedding_dim = 16
batch_size = 80

print('Loading data...')
raw_train, raw_valid, _ = get_data(n_train=50000, n_valid=1000, n_test=10000)

#def remove_new_labels(df, label_field, new_labels):
#    print('df has %s rows' % len(df))
#    df = df[~df[label_field].isin(new_labels)]
#    print('reduced to %s rows' % len(df))
#    return df
#
#new_labels = ['005', '011', '045', '068']
#raw_train = remove_new_labels(raw_train, 'ACTIVITY_CD', new_labels)
#raw_valid = remove_new_labels(raw_valid, 'ACTIVITY_CD', new_labels)
    
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(raw_train['NARRATIVE'])
max_features = len(tokenizer.word_index)
print('max_features', max_features)
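The example breaks off after fitting the character-level tokenizer. A sketch of a likely next step, under the assumption that each narrative is flattened to a single padded character sequence (the hierarchical sentence/word/character reshaping the settings above imply is not shown):

from keras.preprocessing.sequence import pad_sequences

# Tokenizer indices start at 1, so an Embedding layer over these sequences
# would need input_dim=max_features + 1.
char_seqs = tokenizer.texts_to_sequences(raw_train['NARRATIVE'])
X_train = pad_sequences(char_seqs,
                        maxlen=max_sent_seq * max_word_seq * max_char_seq)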