Example #1
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)

    Y_array = np.array(Y)
    # print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    assert num_examples == num_y_examples

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    train_x, valid_x, train_y, valid_y = train_test_split(X_array, Y_array, test_size=0.2, random_state=18)
    optim = RandomSearch(
        get_random_config,
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y, len(feature_alphabet), num_outputs),
    )
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
Example #2
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(
        working_dir)

    Y_array = np.array(Y)
    #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    assert num_examples == num_y_examples

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    train_x, valid_x, train_y, valid_y = train_test_split(X_array,
                                                          Y_array,
                                                          test_size=0.2,
                                                          random_state=18)
    optim = RandomSearch(
        get_random_config,
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                  len(feature_alphabet), num_outputs))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
Example #3
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)
    
    X_segments, dimensions = split_entity_data(X_array, feature_alphabet)
    Y_array = np.array(Y)
    Y_adj, indices = ctk_io.flatten_outputs(Y_array)
    
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)
    
    train_x0, valid_x0, train_x1, valid_x1, train_x2, valid_x2, train_y, valid_y = train_test_split(
        X_segments[0], X_segments[1], X_segments[2], Y_array,
        test_size=0.2, random_state=18)
    train_x = [train_x0, train_x1, train_x2]
    valid_x = [valid_x0, valid_x1, valid_x2]
    
    optim = RandomSearch(
        get_random_config,
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                  len(feature_alphabet), num_outputs))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
Example #4
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(
        working_dir)

    X_segments, dimensions = split_entity_data(X_array, feature_alphabet)
    Y_array = np.array(Y)
    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    train_x0, valid_x0, train_x1, valid_x1, train_x2, valid_x2, train_y, valid_y = train_test_split(
        X_segments[0],
        X_segments[1],
        X_segments[2],
        Y_array,
        test_size=0.2,
        random_state=18)
    train_x = [train_x0, train_x1, train_x2]
    valid_x = [valid_x0, valid_x1, valid_x2]

    optim = RandomSearch(
        get_random_config,
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                  len(feature_alphabet), num_outputs))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
Example #5
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    (train_y, label_alphabet, train_x, feats_alphabet) = ctk_io.read_token_sequence_data(working_dir)


    init_vectors = None #used for pre-trained embeddings

    # load pre-trained embeddings (NOTE: hardcoded local path)
    embedding_file = '/Users/chenlin/Programming/ctakesWorkspace/neural-temporal/src/main/resources/org/apache/ctakes/temporal/thyme_word2vec_timex_50.vec'
    weights = ctk_io.read_embeddings(embedding_file, feats_alphabet)
    # if len(args) > 1 and best_config['pretrain'] == True:
    #     weights = ctk_io.read_embeddings(args[1], feats_alphabet)
    # elif best_config['pretrain'] and len(args) == 1:
    #     sys.stderr.write("Error: Pretrain specified but no weights file given!")
    #     sys.exit(-1)
    
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    outcomes = set(train_y)
    classes = len(outcomes)

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    #pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
    #pickle.dump(dataset1.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb"))
    #test_x = pad_sequences(test_x, maxlen=maxlen)
    #test_y = to_categorical(np.array(test_y), classes)

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    branches = [] # models to be merged
    train_xs = [] # train x for each branch
    #test_xs = []  # test x for each branch

    filtlens = "1,2,3,4,5"
    for filter_len in filtlens.split(','):
        branch = Sequential()
        branch.add(Embedding(len(feats_alphabet),
                             weights.shape[1],
                             input_length=maxlen,
                             weights=[weights],
                             trainable=False))
        branch.add(Convolution1D(nb_filter=200,
                                 filter_length=int(filter_len),
                                 border_mode='valid',
                                 activation='relu',
                                 subsample_length=1))
        branch.add(MaxPooling1D(pool_length=2))
        branch.add(Flatten())

        branches.append(branch)
        train_xs.append(train_x)
        #test_xs.append(test_x)
    branch = Sequential()
    branch.add(Embedding(len(feats_alphabet),
                         weights.shape[1],
                         input_length=maxlen,
                         weights=[weights],
                         trainable=False))
    branch.add(Convolution1D(nb_filter=200,
                             filter_length=3,
                             border_mode='valid',
                             activation='relu',
                             subsample_length=1))
    branch.add(Convolution1D(nb_filter=200,
                             filter_length=3,
                             border_mode='same',
                             activation='relu',
                             subsample_length=1))
    branch.add(MaxPooling1D(pool_length=2))
    branch.add(Flatten())

    branches.append(branch)
    train_xs.append(train_x)

    model = Sequential()
    model.add(Merge(branches, mode='concat'))

    model.add(Dense(250))  #cfg.getint('cnn', 'hidden')
    model.add(Dropout(0.25))  #cfg.getfloat('cnn', 'dropout')
    model.add(Activation('relu'))

    model.add(Dropout(0.25))  #cfg.getfloat('cnn', 'dropout')
    model.add(Dense(classes))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.0001,  #cfg.getfloat('cnn', 'learnrt')
                        rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(train_xs,
              train_y,
              nb_epoch=20,  #cfg.getint('cnn', 'epochs')
              batch_size=50,  #cfg.getint('cnn', 'batches')
              verbose=1,
              validation_split=0.1,
              class_weight=None)

    model.summary()

    json_string = model.to_json()
    with open(os.path.join(working_dir, 'model_0.json'), 'w') as f:
        f.write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

    # pickle in binary mode so this works under both Python 2 and 3
    with open(os.path.join(working_dir, 'alphabets.pkl'), 'wb') as fn:
        pickle.dump((feats_alphabet, label_alphabet, maxlen), fn)

    with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json')
        myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')

    sys.exit(0)
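
For reference, the script.model archive written above can be turned back into a usable model. A sketch of the matching loader, assuming only the files the packaging step writes (this function is not part of the original listing):

import os
import pickle
from zipfile import ZipFile
from keras.models import model_from_json

def load_packaged_model(working_dir):
    # Unzip the archive written above, rebuild the network from its JSON
    # architecture, then restore the weights and the pickled alphabets.
    with ZipFile(os.path.join(working_dir, 'script.model'), 'r') as myzip:
        myzip.extract('model_0.json', working_dir)
        myzip.extract('model_0.h5', working_dir)
        myzip.extract('alphabets.pkl', working_dir)

    with open(os.path.join(working_dir, 'model_0.json')) as f:
        model = model_from_json(f.read())
    model.load_weights(os.path.join(working_dir, 'model_0.h5'))

    with open(os.path.join(working_dir, 'alphabets.pkl'), 'rb') as f:
        feats_alphabet, label_alphabet, maxlen = pickle.load(f)
    return model, feats_alphabet, label_alphabet, maxlen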
Example #6
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <working_dir>\n")
        sys.exit(-1)

    working_dir = args[0]
    
    ### Extract existing model:
    print("Extracting existing model")
    with ZipFile(os.path.join(working_dir, 'script.model'), 'r') as myzip:
        myzip.extract('model.h5', working_dir)
        myzip.extract('alphabets.pkl', working_dir)

    with open(os.path.join(working_dir, 'alphabets.pkl'), 'rb') as f:
        (feature_alphabet, label_alphabet) = pickle.load(f)
    label_lookup = {val: key for (key, val) in label_alphabet.items()}
    model = load_model(os.path.join(working_dir, "model.h5"))
    #config = model.get_config()
    
    #model = Container.from_config(config)
    
    ## Find the model params needed by CNN method and get a cnn with one extra FC layer:
    # nn_models.get_cnn_model(X_array.shape, len(feature_alphabet), num_outputs, conv_layers=convs, fc_layers=layers, 
    #                                    embed_dim=embed_dim, filter_widths=width)
    print("Building new model with extra layer")
    convs = []
    dense = []
    for layer in model.layers:
        if 'convolution' in layer.name:
            convs.append(layer)
        if 'dense' in layer.name:
            dense.append(layer)
            
    filters = [x.filter_length for x in convs]
    nb_filters = (convs[0].nb_filter,)
    fc_widths = [x.output_dim for x in dense]
    fc_widths.append(fc_widths[-1] // 2)
    
    new_model = nn_models.get_cnn_model(model.layers[0].input_shape,
                                        model.layers[1].input_dim,
                                        model.layers[-1].output_dim,
                                        conv_layers=nb_filters,
                                        fc_layers=fc_widths,
                                        embed_dim=model.layers[1].output_dim,
                                        filter_widths=filters)
    
    ## Just so I don't accidentally try to refer to this later
    del model
    
    ## Change the name of the output layer so that we don't try to read those weights in -- we will have a different number of parameters:
    #new_model.layers[-1].name = "NewOutput"
    
    ## Load as many weights as possible taking advantage of consistently named layers:
    new_model.load_weights(os.path.join(working_dir, "model.h5"), by_name=True)
    

    ## Re-load data and retrain model:
    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)
    
    Y_array = np.array(Y)
    #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))
    
    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)
    
    Y_adj, indices = ctk_io.flatten_outputs(Y_array)
    out_counts = Y_adj.sum(0)
        
    stopper = nn_models.get_early_stopper()
    
    print("Retraining model")
    new_model.fit(X_array, Y_adj,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2) #,
                  #callbacks=[stopper]) #,
                  #class_weight=class_weights)
                  
    new_model.summary()
    
    new_model.save(os.path.join(working_dir, 'new_model.h5'), overwrite=True)
    
    with ZipFile(os.path.join(working_dir, 'extended.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'new_model.h5'), 'model.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')
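
nn_models.get_cnn_model is only visible through its call sites in these snippets; the fit call above with no explicit compile suggests the helper compiles the model itself. A hypothetical sketch inferred from the arguments used above, written against the same Keras 1.x API (embedding, parallel 1-D convolutions of several widths, max pooling, a fully connected stack, and a softmax/sigmoid output):

from keras.layers import (Input, Embedding, Convolution1D,
                          GlobalMaxPooling1D, Dense, merge)
from keras.models import Model

def get_cnn_model(input_shape, vocab_size, num_outputs, conv_layers=(200,),
                  fc_layers=(100,), embed_dim=100, filter_widths=(3,)):
    # Hypothetical reconstruction -- the real helper is not in this listing.
    tokens = Input(shape=(input_shape[1],), dtype='int32')
    embedded = Embedding(vocab_size, embed_dim)(tokens)

    pooled = []
    for nb_filter in conv_layers:
        for width in filter_widths:
            conv = Convolution1D(nb_filter=nb_filter, filter_length=width,
                                 activation='relu')(embedded)
            pooled.append(GlobalMaxPooling1D()(conv))
    x = merge(pooled, mode='concat') if len(pooled) > 1 else pooled[0]

    for fc_width in fc_layers:
        x = Dense(fc_width, activation='relu')(x)

    out_activation = 'sigmoid' if num_outputs == 1 else 'softmax'
    outputs = Dense(num_outputs, activation=out_activation)(x)

    model = Model(input=tokens, output=outputs)
    model.compile(optimizer='adam',
                  loss=('binary_crossentropy' if num_outputs == 1
                        else 'categorical_crossentropy'),
                  metrics=['accuracy'])
    return model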
Example #7
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <working_dir>\n")
        sys.exit(-1)

    working_dir = args[0]

    ### Extract existing model:
    print("Extracting existing model")
    with ZipFile(os.path.join(working_dir, 'script.model'), 'r') as myzip:
        myzip.extract('model.h5', working_dir)
        myzip.extract('alphabets.pkl', working_dir)

    with open(os.path.join(working_dir, 'alphabets.pkl'), 'rb') as f:
        (feature_alphabet, label_alphabet) = pickle.load(f)
    label_lookup = {val: key for (key, val) in label_alphabet.items()}
    model = load_model(os.path.join(working_dir, "model.h5"))
    #config = model.get_config()

    #model = Container.from_config(config)

    ## Find the model params needed by CNN method and get a cnn with one extra FC layer:
    # nn_models.get_cnn_model(X_array.shape, len(feature_alphabet), num_outputs, conv_layers=convs, fc_layers=layers,
    #                                    embed_dim=embed_dim, filter_widths=width)
    print("Building new model with extra layer")
    convs = []
    dense = []
    for layer in model.layers:
        if 'convolution' in layer.name:
            convs.append(layer)
        if 'dense' in layer.name:
            dense.append(layer)

    filters = [x.filter_length for x in convs]
    nb_filters = (convs[0].nb_filter,)
    fc_widths = [x.output_dim for x in dense]
    #fc_widths.append(fc_widths[-1] //2)
    fc_widths.append(fc_widths[-1])

    new_model = nn_models.get_cnn_model(model.layers[0].input_shape,
                                        model.layers[1].input_dim,
                                        model.layers[-1].output_dim,
                                        conv_layers=nb_filters,
                                        fc_layers=fc_widths,
                                        embed_dim=model.layers[1].output_dim,
                                        filter_widths=filters)

    ## Just so I don't accidentally try to refer to this later
    del model

    ## Change the name of the output layer so that we don't try to read those weights in -- we will have a different number of parameters:
    #new_model.layers[-1].name = "NewOutput"

    ## Load as many weights as possible taking advantage of consistently named layers:
    new_model.load_weights(os.path.join(working_dir, "model.h5"), by_name=True)

    ## Re-load data and retrain model:
    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(
        working_dir)

    Y_array = np.array(Y)
    #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)
    out_counts = Y_adj.sum(0)

    stopper = nn_models.get_early_stopper()

    print("Retraining model")
    new_model.fit(X_array,
                  Y_adj,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2)  #,
    #callbacks=[stopper]) #,
    #class_weight=class_weights)

    new_model.summary()

    new_model.save(os.path.join(working_dir, 'new_model.h5'), overwrite=True)

    with ZipFile(os.path.join(working_dir, 'extended.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'new_model.h5'), 'model.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'),
                    'alphabets.pkl')
Example #8
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    (train_y, label_alphabet, train_x,
     feats_alphabet) = ctk_io.read_token_sequence_data(working_dir)

    init_vectors = None  #used for pre-trained embeddings

    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    outcomes = set(train_y)
    classes = len(outcomes)

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    #pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
    #pickle.dump(dataset1.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb"))
    #test_x = pad_sequences(test_x, maxlen=maxlen)
    #test_y = to_categorical(np.array(test_y), classes)

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    #branches = [] # models to be merged
    #train_xs = [] # train x for each branch
    #test_xs = []  # test x for each branch

    model = resnet(maxlen, feats_alphabet, classes)

    optimizer = RMSprop(
        lr=0.0001,  #cfg.getfloat('cnn', 'learnrt'),
        rho=0.9,
        epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])  #{'0':'accuracy'})#
    model.fit(
        train_x,
        train_y,
        nb_epoch=10,  #cfg.getint('cnn', 'epochs'),
        batch_size=50,  #cfg.getint('cnn', 'batches'),
        verbose=1,
        validation_split=0.1,
        class_weight=None)

    model.summary()

    json_string = model.to_json()
    with open(os.path.join(working_dir, 'model_0.json'), 'w') as f:
        f.write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

    # pickle in binary mode so this works under both Python 2 and 3
    with open(os.path.join(working_dir, 'alphabets.pkl'), 'wb') as fn:
        pickle.dump((feats_alphabet, label_alphabet, maxlen), fn)

    with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json')
        myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'),
                    'alphabets.pkl')
    sys.exit(0)
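
The resnet builder is also defined outside this listing; only its resnet(maxlen, feats_alphabet, classes) signature is visible. A hypothetical sketch of a 1-D residual block over token embeddings in the Keras 1.x functional API (layer sizes are assumptions; the model is returned uncompiled because the caller compiles it with RMSprop above):

from keras.layers import (Input, Embedding, Convolution1D, MaxPooling1D,
                          Flatten, Dense, merge)
from keras.models import Model

def resnet(maxlen, feats_alphabet, classes, embed_dim=100, nb_filter=200):
    # Hypothetical reconstruction -- the real resnet() is not in this listing.
    tokens = Input(shape=(maxlen,), dtype='int32')
    x = Embedding(len(feats_alphabet), embed_dim, input_length=maxlen)(tokens)

    # a 1x1 convolution projects the embeddings to nb_filter channels so the
    # shortcut and the convolution output can be summed element-wise
    shortcut = Convolution1D(nb_filter=nb_filter, filter_length=1,
                             border_mode='same')(x)
    y = Convolution1D(nb_filter=nb_filter, filter_length=3,
                      border_mode='same', activation='relu')(shortcut)
    y = Convolution1D(nb_filter=nb_filter, filter_length=3,
                      border_mode='same')(y)
    x = merge([shortcut, y], mode='sum')  # the residual connection

    x = MaxPooling1D(pool_length=2)(x)
    x = Flatten()(x)
    outputs = Dense(classes, activation='softmax')(x)
    return Model(input=tokens, output=outputs)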
Example #9
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    (train_y, label_alphabet, train_x, feats_alphabet) = ctk_io.read_token_sequence_data(working_dir)

    init_vectors = None #used for pre-trained embeddings
    
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    outcomes = set(train_y)
    classes = len(outcomes)

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    #pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
    #pickle.dump(dataset1.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb"))
    #test_x = pad_sequences(test_x, maxlen=maxlen)
    #test_y = to_categorical(np.array(test_y), classes)

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    #branches = [] # models to be merged
    #train_xs = [] # train x for each branch
    #test_xs = []  # test x for each branch

    model = resnet(maxlen, feats_alphabet, classes)

    optimizer = RMSprop(lr=0.0001,  #cfg.getfloat('cnn', 'learnrt')
                        rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])  #{'0':'accuracy'}
    model.fit(train_x,
              train_y,
              nb_epoch=10,  #cfg.getint('cnn', 'epochs')
              batch_size=50,  #cfg.getint('cnn', 'batches')
              verbose=1,
              validation_split=0.1,
              class_weight=None)

    model.summary()

    json_string = model.to_json()
    with open(os.path.join(working_dir, 'model_0.json'), 'w') as f:
        f.write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

    # pickle in binary mode so this works under both Python 2 and 3
    with open(os.path.join(working_dir, 'alphabets.pkl'), 'wb') as fn:
        pickle.dump((feats_alphabet, label_alphabet, maxlen), fn)

    with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json')
        myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')
    sys.exit(0)