def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, outcome_map, outcome_list, X, feature_alphabet = ctk_io.read_multitask_token_sequence_data(working_dir)
    start_ind = feature_alphabet[start_symbol]
    end_ind = feature_alphabet[end_symbol]

    train_x, valid_x, train_y, valid_y = train_test_split(X, Y, test_size=0.2, random_state=7)
    #X_distance = get_distance_features(X, start_ind, end_ind)

    print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    weights = None
    if len(args) > 1:
        weights = ctk_io.read_embeddings(args[1], feature_alphabet)

    train_y_adj, train_indices = ctk_io.flatten_outputs(train_y)
    valid_y_adj, valid_indices = ctk_io.flatten_outputs(valid_y)
    if train_indices != valid_indices:
        print("Error: training and valid sets have different index sets -- may be missing some labels in one set or the other")
        sys.exit(-1)

    output_dims_list = []
    train_y_list = []
    valid_y_list = []
    indices = train_indices
    for i in range(len(indices) - 1):
        label_dims = indices[i + 1] - indices[i]
        output_dims_list.append(label_dims)
        if label_dims == 1:
            train_y_list.append(train_y_adj[:, indices[i]])
            valid_y_list.append(valid_y_adj[:, indices[i]])
        else:
            train_y_list.append(train_y_adj[:, indices[i]:indices[i + 1]])
            valid_y_list.append(valid_y_adj[:, indices[i]:indices[i + 1]])

        print("Dimensions of label %d are %s" % (i, str(train_y_list[-1].shape)))

    ## Pass the search a function it uses to get a random config,
    ## and a function that returns an eval given (e)pochs and a (c)onfig:
    optim = RandomSearch(lambda: get_random_config(weights),
                         lambda e, c: run_one_eval(e, c, train_x, train_y_list, valid_x, valid_y_list,
                                                   len(feature_alphabet), output_dims_list, weights))
    best_config = optim.optimize(max_iter=27)

    with open(os.path.join(working_dir, 'model_0.config'), 'w') as config_file:
        config_file.write(str(best_config))
    print("Best config returned by optimizer is %s" % str(best_config))

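## The two lambdas above wire this script's get_random_config() and run_one_eval() into
## RandomSearch: the first samples a candidate configuration, the second trains and scores
## one model for that configuration, given a number of (e)pochs and a (c)onfig. A rough,
## hypothetical sketch of that contract follows -- the hyperparameter names, ranges, and
## the use of nn_models.get_multitask_cnn() here are illustrative assumptions, not the
## script's actual implementation.
import random

def get_random_config_sketch(weights=None):
    ## Return one randomly sampled hyperparameter setting for the optimizer to try.
    ## (The real get_random_config presumably also decides how to use pretrained `weights`.)
    return {
        'batch_size': random.choice((32, 64, 128)),
        'embed_dim': random.choice((50, 100, 200)),
        'filters': random.choice(((128,), (256,))),
        'widths': random.choice(((2, 3), (3, 4, 5))),
        'layers': random.choice(((64,), (128,), (256, 64))),
    }

def run_one_eval_sketch(epochs, config, train_x, train_y_list, valid_x, valid_y_list,
                        vocab_size, output_dims_list, weights):
    ## Train one multitask CNN with the sampled hyperparameters and return the final
    ## validation loss so RandomSearch can rank this config against the others.
    model = nn_models.get_multitask_cnn(train_x.shape, vocab_size, output_dims_list,
                                        conv_layers=config['filters'],
                                        fc_layers=config['layers'],
                                        embed_dim=config['embed_dim'],
                                        filter_widths=config['widths'])
    history = model.fit(train_x, train_y_list, nb_epoch=epochs,
                        batch_size=config['batch_size'], verbose=0,
                        validation_data=(valid_x, valid_y_list))
    return history.history['val_loss'][-1]
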
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    #print("Reading data...")
    Y, X = ctk_io.read_multitask_liblinear(working_dir)  # ('data_testing/multitask_assertion/train_and_test')
    stopper = nn_models.get_early_stopper()

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples
    #print("Data has %d examples and dimension %d" % (num_examples, dimension) )
    #print("Output has %d dimensions" % (num_labels) )

    X = np.reshape(X, (num_examples, 11, dimension // 11))

    Y_adj, indices = ctk_io.flatten_outputs(Y)
    #print("After reshaping the data has shape %s" % (str(X.shape)))

    ## Train one CNN per output label and write each model to the working directory:
    for label_ind in range(0, Y.shape[1]):
        num_outputs = indices[label_ind + 1] - indices[label_ind]
        model = nn_models.get_cnn_model(X.shape, num_outputs)
        #print("For label ind %d, grabbing indices from %d to %d" % (label_ind, int(indices[label_ind]), int(indices[label_ind+1])))
        train_y = Y_adj[:, int(indices[label_ind]):int(indices[label_ind + 1])]
        #if(train_y.shape[-1] == 1):
        #    print("Number of values=1 is %d" % (train_y.sum()))
        #print("Shape of y is %s, shape of X is %s, max value in y is %f and min is %f" % (str(train_y.shape), str(X.shape), train_y.max(), train_y.min()) )

        model.fit(X, train_y,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2,
                  callbacks=[stopper])

        model.summary()

        json_string = model.to_json()
        with open(os.path.join(working_dir, 'model_%d.json' % label_ind), 'w') as json_file:
            json_file.write(json_string)
        model.save_weights(os.path.join(working_dir, 'model_%d.h5' % label_ind), overwrite=True)
        #print("This model has %d layers and layer 3 has %d weights" % (len(model.layers), len(model.layers[3].get_weights()) ) )
        #print("The weight of the first layer at index 50 is %f" % model.layers[3].get_weights()[50])

    sys.exit(0)

def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)
    X_segments, dimensions = split_entity_data(X_array, feature_alphabet)

    Y_array = np.array(Y)
    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    train_x0, valid_x0, train_x1, valid_x1, train_x2, valid_x2, train_y, valid_y = train_test_split(
        X_segments[0], X_segments[1], X_segments[2], Y_array, test_size=0.2, random_state=18)
    train_x = [train_x0, train_x1, train_x2]
    valid_x = [valid_x0, valid_x1, valid_x2]

    ## Random hyperparameter search using the held-out validation split:
    optim = RandomSearch(lambda: get_random_config(),
                         lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                                   len(feature_alphabet), num_outputs))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)

def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)
    Y_array = np.array(Y)
    #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)
    assert num_examples == num_y_examples

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    train_x, valid_x, train_y, valid_y = train_test_split(X_array, Y_array, test_size=0.2, random_state=18)

    ## Random hyperparameter search using the held-out validation split:
    optim = RandomSearch(lambda: get_random_config(),
                         lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                                   len(feature_alphabet), num_outputs))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)

def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <working_dir>\n")
        sys.exit(-1)

    working_dir = args[0]

    ### Extract existing model:
    print("Extracting existing model")
    with ZipFile(os.path.join(working_dir, 'script.model'), 'r') as myzip:
        myzip.extract('model.h5', working_dir)
        myzip.extract('alphabets.pkl', working_dir)

    (feature_alphabet, label_alphabet) = pickle.load(open(os.path.join(working_dir, 'alphabets.pkl'), 'rb'))
    label_lookup = {val: key for (key, val) in label_alphabet.items()}
    model = load_model(os.path.join(working_dir, "model.h5"))
    #config = model.get_config()
    #model = Container.from_config(config)

    ## Find the model params needed by the CNN method and get a CNN with one extra FC layer:
    # nn_models.get_cnn_model(X_array.shape, len(feature_alphabet), num_outputs, conv_layers=convs,
    #                         fc_layers=layers, embed_dim=embed_dim, filter_widths=width)
    print("Building new model with extra layer")
    convs = []
    dense = []
    for layer in model.layers:
        if 'convolution' in layer.name:
            convs.append(layer)
        if 'dense' in layer.name:
            dense.append(layer)

    filters = [x.filter_length for x in convs]
    nb_filters = (convs[0].nb_filter,)
    fc_widths = [x.output_dim for x in dense]
    fc_widths.append(fc_widths[-1] // 2)

    new_model = nn_models.get_cnn_model(model.layers[0].input_shape, model.layers[1].input_dim,
                                        model.layers[-1].output_dim, conv_layers=nb_filters,
                                        fc_layers=fc_widths, embed_dim=model.layers[1].output_dim,
                                        filter_widths=filters)

    ## Just so I don't accidentally try to refer to this later
    del model

    ## Change the name of the output layer so that we don't try to read those weights in -- we will have a different number of parameters:
    #new_model.layers[-1].name = "NewOutput"

    ## Load as many weights as possible, taking advantage of consistently named layers:
    new_model.load_weights(os.path.join(working_dir, "model.h5"), by_name=True)

    ## Re-load data and retrain model:
    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)
    Y_array = np.array(Y)
    #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)
    out_counts = Y_adj.sum(0)

    stopper = nn_models.get_early_stopper()

    print("Retraining model")
    new_model.fit(X_array, Y_adj,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2)  #,
                  #callbacks=[stopper])  #,
                  #class_weight=class_weights)

    new_model.summary()

    new_model.save(os.path.join(working_dir, 'new_model.h5'), overwrite=True)

    with ZipFile(os.path.join(working_dir, 'extended.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'new_model.h5'), 'model.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')

def main(args):
    working_dir = args[0]

    print("Reading data...")
    Y, X = ctk_io.read_multitask_liblinear(working_dir)  # get_data()
    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    print("Data has %d examples and dimension %d" % (num_examples, dimension))
    print("Output has %d dimensions" % (num_labels))

    Y_adj, indices = ctk_io.flatten_outputs(Y)
    print("%d labels mapped to %d outputs based on category numbers" % (Y.shape[1], Y_adj.shape[1]))
    label_scores = []

    for label_ind in range(0, Y.shape[1]):
        num_outputs = indices[label_ind + 1] - indices[label_ind]
        #model = models.get_mlp_model(dimension, num_outputs)
        print("Starting to train for label %d with %d outputs" % (label_ind, num_outputs))

        folds = sk.cross_validation.KFold(num_examples, n_folds=num_folds)
        scores = []
        total_tp = 0
        total_fp = 0
        total_fn = 0
        fold_ind = 0
        total_score = 0

        for train_indices, test_indices in folds:
            print("Starting fold %d" % fold_ind)
            train_x = X[train_indices]
            train_y = Y_adj[train_indices, int(indices[label_ind]):int(indices[label_ind + 1])]
            test_x = X[test_indices]
            test_y = Y_adj[test_indices, int(indices[label_ind]):int(indices[label_ind + 1])]

            model = nn_models.get_mlp_model(dimension, num_outputs)
            model.fit(train_x, train_y, nb_epoch=nb_epoch, batch_size=batch_size)

            ### This was to test model reading/writing and it works fine.
            #temp_dir = tempfile.mkdtemp()
            #json_string = model.to_json()
            #open(os.path.join(temp_dir, 'model_%d.json' % label_ind), 'w').write(json_string)
            #model.save_weights(os.path.join(temp_dir, 'model_%d.h5' % label_ind), overwrite=True)
            #
            #model = None
            #
            #model = model_from_json(open(os.path.join(temp_dir, "model_%d.json" % label_ind)).read())
            #model.load_weights(os.path.join(temp_dir, "model_%d.h5" % label_ind))

            if num_outputs == 1:
                labels = test_y
                predictions = model.predict_classes(test_x, batch_size=batch_size)
                #labels = np.reshape(test_y, (len(test_y),1))

                ## Count up true positives, where prediction = label = 1, i.e. prediction + label == 2
                tp = len(np.where((predictions + labels) == 2)[0])
                total_tp += tp
                ## False positives: prediction - label = 1
                fp = len(np.where((predictions - labels) == 1)[0])
                total_fp += fp
                ## False negatives: label - prediction = 1
                fn = len(np.where((labels - predictions) == 1)[0])
                total_fn += fn
                print("tp=%d, fp=%d, fn=%d" % (tp, fp, fn))
                recall = tp / float(tp + fn) if tp > 0 else 0
                precision = tp / float(tp + fp) if tp > 0 else 1
                f1 = get_f(recall, precision)
                print("P=%f, R=%f, F1=%f" % (precision, recall, f1))
            else:
                score = model.evaluate(test_x, test_y, batch_size=batch_size)
                print("score=%s" % (score))
                total_score += score[1]
                #score = model.evaluate(test_x, test_y, show_accuracy=True, batch_size=batch_size)
                #print("Scores for fold %d:" % fold_ind)
                #print("test score: ", score[0])
                #print("test accuracy: ", score[1])

            fold_ind += 1

        if num_outputs == 1:
            recall = total_tp / float(total_tp + total_fn)
            precision = total_tp / float(total_tp + total_fp)
            f1 = get_f(recall, precision)
            print("Overall total: P=%f, R=%f, F=%f" % (precision, recall, f1))
            label_scores.append(f1)
        else:
            total_score /= num_folds
            print("Overall accuracy = %f" % (total_score))
            label_scores.append(total_score)

    for ind, val in enumerate(label_scores):
        print("%s of label %d is %f" % ("Fscore" if num_outputs == 1 else "Accuracy", ind, val))

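## get_f() used above is assumed to be the balanced F-measure computed from recall and
## precision (note the call order in this script: recall first, then precision).
## A minimal sketch under that assumption:
def get_f_sketch(recall, precision):
    ## Harmonic mean of precision and recall; return 0 when both are 0 to avoid dividing by zero.
    if precision + recall == 0:
        return 0.0
    return 2.0 * precision * recall / (precision + recall)
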
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <working_dir>\n")
        sys.exit(-1)

    working_dir = args[0]

    ### Extract existing model:
    print("Extracting existing model")
    with ZipFile(os.path.join(working_dir, 'script.model'), 'r') as myzip:
        myzip.extract('model.h5', working_dir)
        myzip.extract('alphabets.pkl', working_dir)

    (feature_alphabet, label_alphabet) = pickle.load(open(os.path.join(working_dir, 'alphabets.pkl'), 'rb'))
    label_lookup = {val: key for (key, val) in label_alphabet.items()}
    model = load_model(os.path.join(working_dir, "model.h5"))
    #config = model.get_config()
    #model = Container.from_config(config)

    ## Find the model params needed by the CNN method and get a CNN with one extra FC layer:
    # nn_models.get_cnn_model(X_array.shape, len(feature_alphabet), num_outputs, conv_layers=convs,
    #                         fc_layers=layers, embed_dim=embed_dim, filter_widths=width)
    print("Building new model with extra layer")
    convs = []
    dense = []
    for layer in model.layers:
        if 'convolution' in layer.name:
            convs.append(layer)
        if 'dense' in layer.name:
            dense.append(layer)

    filters = [x.filter_length for x in convs]
    nb_filters = (convs[0].nb_filter,)
    fc_widths = [x.output_dim for x in dense]
    #fc_widths.append(fc_widths[-1] // 2)
    fc_widths.append(fc_widths[-1])

    new_model = nn_models.get_cnn_model(model.layers[0].input_shape, model.layers[1].input_dim,
                                        model.layers[-1].output_dim, conv_layers=nb_filters,
                                        fc_layers=fc_widths, embed_dim=model.layers[1].output_dim,
                                        filter_widths=filters)

    ## Just so I don't accidentally try to refer to this later
    del model

    ## Change the name of the output layer so that we don't try to read those weights in -- we will have a different number of parameters:
    #new_model.layers[-1].name = "NewOutput"

    ## Load as many weights as possible, taking advantage of consistently named layers:
    new_model.load_weights(os.path.join(working_dir, "model.h5"), by_name=True)

    ## Re-load data and retrain model:
    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)
    Y_array = np.array(Y)
    #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)
    out_counts = Y_adj.sum(0)

    stopper = nn_models.get_early_stopper()

    print("Retraining model")
    new_model.fit(X_array, Y_adj,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2)  #,
                  #callbacks=[stopper])  #,
                  #class_weight=class_weights)

    new_model.summary()

    new_model.save(os.path.join(working_dir, 'new_model.h5'), overwrite=True)

    with ZipFile(os.path.join(working_dir, 'extended.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'new_model.h5'), 'model.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')

def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, outcome_map, outcome_list, X, feature_alphabet = ctk_io.read_multitask_token_sequence_data(working_dir)  # ('data_testing/multitask_assertion/train_and_test')
    print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples
    #print("Data has %d examples and dimension %d" % (num_examples, dimension) )
    #print("Output has %d dimensions" % (num_labels) )
    #X = np.reshape(X, (num_examples, 11, dimension / 11))

    Y_adj, indices = ctk_io.flatten_outputs(Y)

    stopper = nn_models.get_early_stopper()

    output_dims_list = []
    y_list = []

    for i in range(len(indices) - 1):
        label_dims = indices[i + 1] - indices[i]
        output_dims_list.append(label_dims)
        if label_dims == 1:
            y_list.append(Y_adj[:, indices[i]])
        else:
            y_list.append(Y_adj[:, indices[i]:indices[i + 1]])

        print("Dimensions of label %d are %s" % (i, str(y_list[-1].shape)))

    model = nn_models.get_multitask_cnn(X.shape, len(feature_alphabet), output_dims_list,
                                        conv_layers=filters, fc_layers=layers,
                                        embed_dim=embed_dim, filter_widths=widths)
    #model = nn_models.get_multitask_mlp(X.shape, len(feature_alphabet), output_dims_list, fc_layers=layers, embed_dim=embed_dim)

    model.fit(X, y_list,
              nb_epoch=nb_epoch,
              batch_size=batch_size,
              verbose=1,
              validation_split=0.2,
              callbacks=[stopper])

    model.summary()

    ## Save the model architecture, weights, and alphabets, then bundle them into script.model:
    json_string = model.to_json()
    with open(os.path.join(working_dir, 'model_0.json'), 'w') as json_file:
        json_file.write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

    #script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    with open(os.path.join(working_dir, 'alphabets.pkl'), 'wb') as fn:
        pickle.dump((feature_alphabet, outcome_map, outcome_list), fn)

    with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json')
        myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')