def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]
   
    print("Reading data...")
    Y, outcome_map, outcome_list, X, feature_alphabet = ctk_io.read_multitask_token_sequence_data(working_dir)
    start_ind = feature_alphabet[start_symbol]
    end_ind = feature_alphabet[end_symbol]
    
    train_x, valid_x, train_y, valid_y = train_test_split(X, Y, test_size=0.2, random_state=7)

#    X_distance = get_distance_features(X, start_ind, end_ind)
    
    print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))
    
    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples
    
    weights = None
    if len(args) > 1:
        weights = ctk_io.read_embeddings(args[1], feature_alphabet)
    
    train_y_adj, train_indices = ctk_io.flatten_outputs(train_y)
    valid_y_adj, valid_indices = ctk_io.flatten_outputs(valid_y)
    if train_indices != valid_indices:
        print("Error: training and valid sets have different index sets -- may be missing some labels in one set or the other")
        sys.exit(-1)
           
    output_dims_list = []
    train_y_list = []
    valid_y_list = []
    indices = train_indices
    for i in range(len(indices)-1):
        label_dims = indices[i+1] - indices[i]
        output_dims_list.append(label_dims)
        if label_dims == 1:
            train_y_list.append(train_y_adj[:, indices[i]])
            valid_y_list.append(valid_y_adj[:, indices[i]])
        else:
            train_y_list.append(train_y_adj[:, indices[i]:indices[i+1]])
            valid_y_list.append(valid_y_adj[:, indices[i]:indices[i+1]])
        
        print("Dimensions of label %d are %s" % (i, str(train_y_list[-1].shape) ) )
    
    ## pass a function to the search that it uses to get a random config
    ## and a function it calls to get an eval score given (e)pochs and a (c)onfig:
    optim = RandomSearch(lambda: get_random_config(weights), lambda e, c: run_one_eval(e, c, train_x, train_y_list, valid_x, valid_y_list, len(feature_alphabet), output_dims_list, weights ) )
    best_config = optim.optimize(max_iter=27)

    open(os.path.join(working_dir, 'model_0.config'), 'w').write( str(best_config) )
    print("Best config returned by optimizer is %s" % str(best_config) )
Example #2
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    #    print("Reading data...")
    Y, X = ctk_io.read_multitask_liblinear(
        working_dir)  # ('data_testing/multitask_assertion/train_and_test')
    stopper = nn_models.get_early_stopper()

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    #print("Data has %d examples and dimension %d" % (num_examples, dimension) )
    #print("Output has %d dimensions" % (num_labels) )

    X = np.reshape(X, (num_examples, 11, dimension // 11))

    Y_adj, indices = ctk_io.flatten_outputs(Y)

    #print("After reshaping the data has shape %s" % (str(X.shape)))

    for label_ind in range(0, Y.shape[1]):

        num_outputs = indices[label_ind + 1] - indices[label_ind]
        model = nn_models.get_cnn_model(X.shape, num_outputs)

        #print("For label ind %d, grabbing indices from %d to %d" % (label_ind, int(indices[label_ind]), int(indices[label_ind+1])))

        train_y = Y_adj[:, int(indices[label_ind]):int(indices[label_ind + 1])]

        #if(train_y.shape[-1] == 1):
        #    print("Number of values=1 is %d" % (train_y.sum()))

        #print("Shape of y is %s, shape of X is %s, max value in y is %f and min is %f" % (str(train_y.shape), str(X.shape), train_y.max(), train_y.min()) )

        model.fit(X,
                  train_y,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2,
                  callbacks=[stopper])

        model.summary()

        json_string = model.to_json()
        open(os.path.join(working_dir, 'model_%d.json' % label_ind),
             'w').write(json_string)
        model.save_weights(os.path.join(working_dir,
                                        'model_%d.h5' % label_ind),
                           overwrite=True)

        #print("This model has %d layers and layer 3 has %d weights" % (len(model.layers), len(model.layers[3].get_weights()) ) )
        #print("The weight of the first layer at index 50 is %f" % model.layers[3].get_weights()[50])

    sys.exit(0)
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)
    
    X_segments, dimensions = split_entity_data(X_array, feature_alphabet)
    Y_array = np.array(Y)
    Y_adj, indices = ctk_io.flatten_outputs(Y_array)
    
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)
    
    train_x0, valid_x0, train_x1, valid_x1, train_x2, valid_x2, train_y, valid_y = train_test_split(X_segments[0], X_segments[1], X_segments[2], Y_array, test_size=0.2, random_state=18)
    train_x = [train_x0, train_x1, train_x2]
    valid_x = [valid_x0, valid_x1, valid_x2]
    
    optim = RandomSearch(lambda: get_random_config(), lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y, len(feature_alphabet), num_outputs ) )
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
Example #4
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(
        working_dir)

    Y_array = np.array(Y)
    #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    assert num_examples == num_y_examples

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    train_x, valid_x, train_y, valid_y = train_test_split(X_array,
                                                          Y_array,
                                                          test_size=0.2,
                                                          random_state=18)
    optim = RandomSearch(
        lambda: get_random_config(),
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                  len(feature_alphabet), num_outputs))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)

    Y_array = np.array(Y)
    # print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    assert num_examples == num_y_examples

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    train_x, valid_x, train_y, valid_y = train_test_split(X_array, Y_array, test_size=0.2, random_state=18)
    optim = RandomSearch(
        lambda: get_random_config(),
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y, len(feature_alphabet), num_outputs),
    )
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(
        working_dir)

    X_segments, dimensions = split_entity_data(X_array, feature_alphabet)
    Y_array = np.array(Y)
    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    train_x0, valid_x0, train_x1, valid_x1, train_x2, valid_x2, train_y, valid_y = train_test_split(
        X_segments[0],
        X_segments[1],
        X_segments[2],
        Y_array,
        test_size=0.2,
        random_state=18)
    train_x = [train_x0, train_x1, train_x2]
    valid_x = [valid_x0, valid_x1, valid_x2]

    optim = RandomSearch(
        lambda: get_random_config(),
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                  len(feature_alphabet), num_outputs))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]
    
#    print("Reading data...")
    Y, X = ctk_io.read_multitask_liblinear(working_dir) # ('data_testing/multitask_assertion/train_and_test') 
    stopper = nn_models.get_early_stopper()
    
    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples
    
    #print("Data has %d examples and dimension %d" % (num_examples, dimension) )
    #print("Output has %d dimensions" % (num_labels) )

    X = np.reshape(X, (num_examples, 11, dimension // 11))
    
    Y_adj, indices = ctk_io.flatten_outputs(Y)

    #print("After reshaping the data has shape %s" % (str(X.shape)))
    
    for label_ind in range(0, Y.shape[1]):
        
        num_outputs = indices[label_ind+1] - indices[label_ind]
        model = nn_models.get_cnn_model(X.shape, num_outputs)

        #print("For label ind %d, grabbing indices from %d to %d" % (label_ind, int(indices[label_ind]), int(indices[label_ind+1])))
        
        train_y = Y_adj[:, int(indices[label_ind]):int(indices[label_ind+1])]

        #if(train_y.shape[-1] == 1):
        #    print("Number of values=1 is %d" % (train_y.sum()))

        #print("Shape of y is %s, shape of X is %s, max value in y is %f and min is %f" % (str(train_y.shape), str(X.shape), train_y.max(), train_y.min()) )
        
        model.fit(X, train_y,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2,
                  callbacks=[stopper])
        
        model.summary()
        
        json_string = model.to_json()
        open(os.path.join(working_dir, 'model_%d.json' % label_ind), 'w').write(json_string)
        model.save_weights(os.path.join(working_dir, 'model_%d.h5' % label_ind), overwrite=True)
        
        #print("This model has %d layers and layer 3 has %d weights" % (len(model.layers), len(model.layers[3].get_weights()) ) )
        #print("The weight of the first layer at index 50 is %f" % model.layers[3].get_weights()[50])
        
    sys.exit(0)
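
The per-label loops above (and the multi-task examples later in this listing) all depend on the indices array returned by ctk_io.flatten_outputs: columns indices[i]:indices[i+1] of the flattened output matrix hold the targets for label i, and a block width of 1 marks a binary label. The following standalone numpy sketch, with made-up data standing in for the real flatten_outputs result, shows that slicing on its own:

import numpy as np

# Made-up flattened outputs for four examples and two labels:
# label 0 is binary (one 0/1 column), label 1 has three one-hot columns.
Y_adj = np.array([[1, 1, 0, 0],
                  [0, 0, 1, 0],
                  [1, 0, 0, 1],
                  [0, 1, 0, 0]])
indices = [0, 1, 4]  # column boundaries per label, as flatten_outputs would report

for i in range(len(indices) - 1):
    label_dims = indices[i + 1] - indices[i]
    if label_dims == 1:
        y_i = Y_adj[:, indices[i]]                 # binary label: a single column
    else:
        y_i = Y_adj[:, indices[i]:indices[i + 1]]  # multi-class: a one-hot block
    print("Dimensions of label %d are %s" % (i, str(y_i.shape)))
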
Example #8
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <working_dir>\n")
        sys.exit(-1)

    working_dir = args[0]
    
    ### Extract existing model:
    print("Extracting existing model")
    with ZipFile(os.path.join(working_dir, 'script.model'), 'r') as myzip:
        myzip.extract('model.h5', working_dir)
        myzip.extract('alphabets.pkl', working_dir)

    (feature_alphabet, label_alphabet) = pickle.load(open(os.path.join(working_dir, 'alphabets.pkl'), 'rb'))
    label_lookup = {val: key for (key, val) in label_alphabet.items()}
    model = load_model(os.path.join(working_dir, "model.h5"))
    #config = model.get_config()
    
    #model = Container.from_config(config)
    
    ## Find the model params needed by CNN method and get a cnn with one extra FC layer:
    # nn_models.get_cnn_model(X_array.shape, len(feature_alphabet), num_outputs, conv_layers=convs, fc_layers=layers, 
    #                                    embed_dim=embed_dim, filter_widths=width)
    print("Building new model with extra layer")
    convs = []
    dense = []
    for layer in model.layers:
        if 'convolution' in layer.name:
            convs.append(layer)
        if 'dense' in layer.name:
            dense.append(layer)
            
    filters = [x.filter_length for x in convs]
    nb_filters = (convs[0].nb_filter,)
    fc_widths = [x.output_dim for x in dense]
    fc_widths.append(fc_widths[-1] // 2)
    
    new_model = nn_models.get_cnn_model(model.layers[0].input_shape, model.layers[1].input_dim, model.layers[-1].output_dim, 
                              conv_layers=nb_filters, fc_layers=fc_widths, embed_dim=model.layers[1].output_dim, filter_widths=filters )
    
    ## Just so I don't accidentally try to refer to this later
    del model
    
    ## Change the name of the output layer so that we don't try to read those weights in -- we will have a different number of parameters:
    #new_model.layers[-1].name = "NewOutput"
    
    ## Load as many weights as possible taking advantage of consistently named layers:
    new_model.load_weights(os.path.join(working_dir, "model.h5"), by_name=True)
    

    ## Re-load data and retrain model:
    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)
    
    Y_array = np.array(Y)
    #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))
    
    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)
    
    Y_adj, indices = ctk_io.flatten_outputs(Y_array)
    out_counts = Y_adj.sum(0)
        
    stopper = nn_models.get_early_stopper()
    
    print("Retraining model")
    new_model.fit(X_array, Y_adj,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2) #,
                  #callbacks=[stopper]) #,
                  #class_weight=class_weights)
                  
    new_model.summary()
    
    new_model.save(os.path.join(working_dir, 'new_model.h5'), overwrite=True)
    
    with ZipFile(os.path.join(working_dir, 'extended.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'new_model.h5'), 'model.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')
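
The "load as many weights as possible" step above leans on Keras matching layers by name: load_weights(..., by_name=True) copies weights only into layers whose names match the saved model, so an extended architecture can reuse shared layers while freshly initializing the added ones. Below is a toy sketch of that idea under the Keras 1.x/2.x API this listing uses (by_name loading changed in later Keras releases); the layer names and sizes are arbitrary and not taken from this project.

from keras.models import Sequential
from keras.layers import Dense

# Original model: layers get explicit names so they can be matched later.
old = Sequential()
old.add(Dense(8, input_dim=4, activation='relu', name='shared_hidden'))
old.add(Dense(2, activation='softmax', name='output'))
old.save_weights('old_weights.h5', overwrite=True)

# Extended model: same named hidden layer, one extra FC layer, renamed output.
new = Sequential()
new.add(Dense(8, input_dim=4, activation='relu', name='shared_hidden'))
new.add(Dense(4, activation='relu', name='extra_fc'))
new.add(Dense(2, activation='softmax', name='new_output'))

# Only 'shared_hidden' picks up the old weights; 'extra_fc' and 'new_output'
# keep their fresh initialization because no saved layer shares their names.
new.load_weights('old_weights.h5', by_name=True)
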
def main(args):
    working_dir = args[0]
    print("Reading data...")
    Y, X = ctk_io.read_multitask_liblinear(working_dir)  # get_data()

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    print("Data has %d examples and dimension %d" % (num_examples, dimension))
    print("Output has %d dimensions" % (num_labels))

    Y_adj, indices = ctk_io.flatten_outputs(Y)

    print("%d labels mapped to %d outputs based on category numbers" % (Y.shape[1], Y_adj.shape[1]))

    label_scores = []

    for label_ind in range(0, Y.shape[1]):

        num_outputs = indices[label_ind + 1] - indices[label_ind]
        #        model = models.get_mlp_model(dimension, num_outputs)

        print("Starting to train for label %d with %d outputs" % (label_ind, num_outputs))

        folds = sk.cross_validation.KFold(num_examples, n_folds=num_folds)

        scores = []
        total_tp = 0
        total_fp = 0
        total_fn = 0
        fold_ind = 0
        total_score = 0

        for train_indices, test_indices in folds:
            print("Starting fold %d" % fold_ind)

            train_x = X[train_indices]
            train_y = Y_adj[train_indices, int(indices[label_ind]) : int(indices[label_ind + 1])]
            test_x = X[test_indices]
            test_y = Y_adj[test_indices, int(indices[label_ind]) : int(indices[label_ind + 1])]

            model = nn_models.get_mlp_model(dimension, num_outputs)

            model.fit(train_x, train_y, nb_epoch=nb_epoch, batch_size=batch_size)

            ### This was to test model reading/writing and it works fine.
            #             temp_dir = tempfile.mkdtemp()
            #             json_string = model.to_json()
            #             open(os.path.join(temp_dir, 'model_%d.json' % label_ind), 'w').write(json_string)
            #             model.save_weights(os.path.join(temp_dir, 'model_%d.h5' % label_ind), overwrite=True)
            #
            #             model = None
            #
            #             model = model_from_json(open(os.path.join(temp_dir, "model_%d.json" % label_ind)).read())
            #             model.load_weights(os.path.join(temp_dir, "model_%d.h5" % label_ind))

            if num_outputs == 1:
                labels = test_y
                predictions = model.predict_classes(test_x, batch_size=batch_size)
                #                labels = np.reshape(test_y, (len(test_y),1))
                ## count up true positive occurrences where prediction = label = 1 aka prediction + label == 2
                tp = len(np.where((predictions + labels) == 2)[0])
                total_tp += tp

                ## false positives: prediction - label = 1
                fp = len(np.where((predictions - labels) == 1)[0])
                total_fp += fp

                ## false negatives: label - prediction = 1
                fn = len(np.where((labels - predictions) == 1)[0])
                total_fn += fn

                print("tp=%d, fp=%d, fn=%d" % (tp, fp, fn))
                recall = tp / float(tp + fn) if tp > 0 else 0
                precision = tp / float(tp + fp) if tp > 0 else 1
                f1 = get_f(recall, precision)
                print("P=%f, R=%f, F1=%f" % (precision, recall, f1))
            else:
                score = model.evaluate(test_x, test_y, batch_size=batch_size)
                print("score=%s" % (score))
                total_score += score[1]

            #        score = model.evaluate(test_x, test_y, show_accuracy=True, batch_size=batch_size)
            #        print("Scores for fold %d:" % fold_ind)
            #        print("test score: ", score[0])
            #        print("test accuracy: " , score[1])
            fold_ind += 1

        if num_outputs == 1:
            recall = total_tp / float(total_tp + total_fn)
            precision = total_tp / float(total_tp + total_fp)
            f1 = get_f(recall, precision)
            print("Overall total: P=%f, R=%f, F=%f" % (recall, precision, f1))
            label_scores.append(f1)
        else:
            total_score /= num_folds
            print("Overall accuracy = %f" % (total_score))
            label_scores.append(total_score)

    for ind, val in enumerate(label_scores):
        print("%s of label %d is %f" % ("Fscore" if num_outputs == 2 else "Accuracy", ind, val))
Example #10
def main(args):
    working_dir = args[0]
    print("Reading data...")
    Y, X = ctk_io.read_multitask_liblinear(working_dir)  # get_data()

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    print("Data has %d examples and dimension %d" % (num_examples, dimension))
    print("Output has %d dimensions" % (num_labels))

    Y_adj, indices = ctk_io.flatten_outputs(Y)

    print("%d labels mapped to %d outputs based on category numbers" %
          (Y.shape[1], Y_adj.shape[1]))

    label_scores = []

    for label_ind in range(0, Y.shape[1]):

        num_outputs = indices[label_ind + 1] - indices[label_ind]
        #        model = models.get_mlp_model(dimension, num_outputs)

        print("Starting to train for label %d with %d outputs" %
              (label_ind, num_outputs))

        folds = sk.cross_validation.KFold(num_examples, n_folds=num_folds)

        scores = []
        total_tp = 0
        total_fp = 0
        total_fn = 0
        fold_ind = 0
        total_score = 0

        for train_indices, test_indices in folds:
            print("Starting fold %d" % fold_ind)

            train_x = X[train_indices]
            train_y = Y_adj[train_indices,
                            int(indices[label_ind]):int(indices[label_ind +
                                                                1])]
            test_x = X[test_indices]
            test_y = Y_adj[test_indices,
                           int(indices[label_ind]):int(indices[label_ind + 1])]

            model = nn_models.get_mlp_model(dimension, num_outputs)

            model.fit(train_x,
                      train_y,
                      nb_epoch=nb_epoch,
                      batch_size=batch_size)

            ### This was to test model reading/writing and it works fine.
            #             temp_dir = tempfile.mkdtemp()
            #             json_string = model.to_json()
            #             open(os.path.join(temp_dir, 'model_%d.json' % label_ind), 'w').write(json_string)
            #             model.save_weights(os.path.join(temp_dir, 'model_%d.h5' % label_ind), overwrite=True)
            #
            #             model = None
            #
            #             model = model_from_json(open(os.path.join(temp_dir, "model_%d.json" % label_ind)).read())
            #             model.load_weights(os.path.join(temp_dir, "model_%d.h5" % label_ind))

            if num_outputs == 1:
                labels = test_y
                predictions = model.predict_classes(test_x,
                                                    batch_size=batch_size)
                #                labels = np.reshape(test_y, (len(test_y),1))
                ## count up true positive occurrences where prediction = label = 1 aka prediction + label == 2
                tp = len(np.where((predictions + labels) == 2)[0])
                total_tp += tp

                ## false positives: prediction - label = 1
                fp = len(np.where((predictions - labels) == 1)[0])
                total_fp += fp

                ## false negatives: label - prediction = 1
                fn = len(np.where((labels - predictions) == 1)[0])
                total_fn += fn

                print("tp=%d, fp=%d, fn=%d" % (tp, fp, fn))
                recall = tp / float(tp + fn) if tp > 0 else 0
                precision = tp / float(tp + fp) if tp > 0 else 1
                f1 = get_f(recall, precision)
                print("P=%f, R=%f, F1=%f" % (precision, recall, f1))
            else:
                score = model.evaluate(test_x, test_y, batch_size=batch_size)
                print("score=%s" % (score))
                total_score += score[1]

    #        score = model.evaluate(test_x, test_y, show_accuracy=True, batch_size=batch_size)
    #        print("Scores for fold %d:" % fold_ind)
    #        print("test score: ", score[0])
    #        print("test accuracy: " , score[1])
            fold_ind += 1

        if num_outputs == 1:
            recall = total_tp / float(total_tp + total_fn)
            precision = total_tp / float(total_tp + total_fp)
            f1 = get_f(recall, precision)
            print("Overall total: P=%f, R=%f, F=%f" % (recall, precision, f1))
            label_scores.append(f1)
        else:
            total_score /= num_folds
            print("Overall accuracy = %f" % (total_score))
            label_scores.append(total_score)

    for ind, val in enumerate(label_scores):
        print("%s of label %d is %f" %
              ("Fscore" if num_outputs == 2 else "Accuracy", ind, val))
Example #11
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <working_dir>\n")
        sys.exit(-1)

    working_dir = args[0]

    ### Extract existing model:
    print("Extracting existing model")
    with ZipFile(os.path.join(working_dir, 'script.model'), 'r') as myzip:
        myzip.extract('model.h5', working_dir)
        myzip.extract('alphabets.pkl', working_dir)

    (feature_alphabet, label_alphabet) = pickle.load(
        open(os.path.join(working_dir, 'alphabets.pkl'), 'rb'))
    label_lookup = {val: key for (key, val) in label_alphabet.items()}
    model = load_model(os.path.join(working_dir, "model.h5"))
    #config = model.get_config()

    #model = Container.from_config(config)

    ## Find the model params needed by CNN method and get a cnn with one extra FC layer:
    # nn_models.get_cnn_model(X_array.shape, len(feature_alphabet), num_outputs, conv_layers=convs, fc_layers=layers,
    #                                    embed_dim=embed_dim, filter_widths=width)
    print("Building new model with extra layer")
    convs = []
    dense = []
    for layer in model.layers:
        if 'convolution' in layer.name:
            convs.append(layer)
        if 'dense' in layer.name:
            dense.append(layer)

    filters = [x.filter_length for x in convs]
    nb_filters = (convs[0].nb_filter, )
    fc_widths = [x.output_dim for x in dense]
    #fc_widths.append(fc_widths[-1] //2)
    fc_widths.append(fc_widths[-1])

    new_model = nn_models.get_cnn_model(model.layers[0].input_shape,
                                        model.layers[1].input_dim,
                                        model.layers[-1].output_dim,
                                        conv_layers=nb_filters,
                                        fc_layers=fc_widths,
                                        embed_dim=model.layers[1].output_dim,
                                        filter_widths=filters)

    ## Just so I don't accidentally try to refer to this later
    del model

    ## Change the name of the output layer so that we don't try to read those weights in -- we will have a different number of parameters:
    #new_model.layers[-1].name = "NewOutput"

    ## Load as many weights as possible taking advantage of consistently named layers:
    new_model.load_weights(os.path.join(working_dir, "model.h5"), by_name=True)

    ## Re-load data and retrain model:
    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(
        working_dir)

    Y_array = np.array(Y)
    #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)
    out_counts = Y_adj.sum(0)

    stopper = nn_models.get_early_stopper()

    print("Retraining model")
    new_model.fit(X_array,
                  Y_adj,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2)  #,
    #callbacks=[stopper]) #,
    #class_weight=class_weights)

    new_model.summary()

    new_model.save(os.path.join(working_dir, 'new_model.h5'), overwrite=True)

    with ZipFile(os.path.join(working_dir, 'extended.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'new_model.h5'), 'model.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'),
                    'alphabets.pkl')
Example #12
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, outcome_map, outcome_list, X, feature_alphabet = ctk_io.read_multitask_token_sequence_data(
        working_dir)
    start_ind = feature_alphabet[start_symbol]
    end_ind = feature_alphabet[end_symbol]

    train_x, valid_x, train_y, valid_y = train_test_split(X,
                                                          Y,
                                                          test_size=0.2,
                                                          random_state=7)

    #    X_distance = get_distance_features(X, start_ind, end_ind)

    print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    weights = None
    if len(args) > 1:
        weights = ctk_io.read_embeddings(args[1], feature_alphabet)

    train_y_adj, train_indices = ctk_io.flatten_outputs(train_y)
    valid_y_adj, valid_indices = ctk_io.flatten_outputs(valid_y)
    if train_indices != valid_indices:
        print(
            "Error: training and valid sets have different index sets -- may be missing some labels in one set or the other"
        )
        sys.exit(-1)

    output_dims_list = []
    train_y_list = []
    valid_y_list = []
    indices = train_indices
    for i in range(len(indices) - 1):
        label_dims = indices[i + 1] - indices[i]
        output_dims_list.append(label_dims)
        if label_dims == 1:
            train_y_list.append(train_y_adj[:, indices[i]])
            valid_y_list.append(valid_y_adj[:, indices[i]])
        else:
            train_y_list.append(train_y_adj[:, indices[i]:indices[i + 1]])
            valid_y_list.append(valid_y_adj[:, indices[i]:indices[i + 1]])

        print("Dimensions of label %d are %s" %
              (i, str(train_y_list[-1].shape)))

    ## pass a function to the search that it uses to get a random config
    ## and a function it calls to get an eval score given (e)pochs and a (c)onfig:
    optim = RandomSearch(
        lambda: get_random_config(weights), lambda e, c: run_one_eval(
            e, c, train_x, train_y_list, valid_x, valid_y_list,
            len(feature_alphabet), output_dims_list, weights))
    best_config = optim.optimize(max_iter=27)

    open(os.path.join(working_dir, 'model_0.config'),
         'w').write(str(best_config))
    print("Best config returned by optimizer is %s" % str(best_config))
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]
    
    print("Reading data...")
    Y, outcome_map, outcome_list, X, feature_alphabet = ctk_io.read_multitask_token_sequence_data(working_dir) # ('data_testing/multitask_assertion/train_and_test') 
    
    print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))
    
    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples
    
    #print("Data has %d examples and dimension %d" % (num_examples, dimension) )
    #print("Output has %d dimensions" % (num_labels) )

    #X = np.reshape(X, (num_examples, 11, dimension / 11))
    
    Y_adj, indices = ctk_io.flatten_outputs(Y)
    stopper = nn_models.get_early_stopper()
    
    output_dims_list = []
    y_list = []
    
    for i in range(len(indices)-1):
        label_dims = indices[i+1] - indices[i]
        output_dims_list.append(label_dims)
        if label_dims == 1:
            y_list.append(Y_adj[:, indices[i]])
        else:
            y_list.append(Y_adj[:, indices[i]:indices[i+1]])
        
        print("Dimensions of label %d are %s" % (i, str(y_list[-1].shape) ) )

    model = nn_models.get_multitask_cnn(X.shape, len(feature_alphabet), output_dims_list, conv_layers=filters, fc_layers=layers, 
                                        embed_dim=embed_dim, filter_widths=widths)
    #model = nn_models.get_multitask_mlp(X.shape, len(feature_alphabet), output_dims_list, fc_layers=layers, embed_dim=embed_dim)
    
    model.fit(X, y_list,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2,
                  callbacks=[stopper])
                  
    model.summary()
    
    json_string = model.to_json()
    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    
    #script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    fn = open(os.path.join(working_dir, 'alphabets.pkl'), 'wb')
    pickle.dump( (feature_alphabet, outcome_map, outcome_list), fn)
    fn.close()
    
    with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json')
        myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')