def main():
    dataset = "bank-full"
    path = "datasets/" + dataset + "/all/"
    fname_data = path + dataset + "_all.X"
    fname_label = path + dataset + "_all.Y"

    data = load_data(fname_data)
    Y = load_labels(fname_label)

    # mine rules
    print "mining rules using FP-growth"
    minsupport = 10
    max_predicates_per_ant = 2
    X_pos, X_neg, nantecedents, antecedent_len, antecedent_set = \
        mine_antecedents(data, Y, minsupport, max_predicates_per_ant)

    n = len(data)

    # learn a falling rule list from the training data
    # set the parameters of Algorithm FRL
    w = 7
    C = 0.000001
    prob_terminate = 0.01
    T_FRL = 3000

    # set the parameters of Algorithm softFRL
    C1 = 0.5
    T_softFRL = 6000

    # set the parameter of the curiosity function
    lmda = 0.8

    # train a falling rule list
    print "running algorithm FRL on bank-full"
    FRL_rule, FRL_prob, FRL_pos_cnt, FRL_neg_cnt, FRL_obj_per_rule, FRL_Ld, \
        FRL_Ld_over_iters, FRL_Ld_best_over_iters = \
        learn_FRL(X_pos, X_neg, n, w, C, prob_terminate, T_FRL, lmda)

    print "FRL learned:"
    display_rule_list(FRL_rule, FRL_prob, antecedent_set, FRL_pos_cnt,
                      FRL_neg_cnt, FRL_obj_per_rule, FRL_Ld)

    print "running algorithm softFRL on bank-full"
    softFRL_rule, softFRL_prob, softFRL_pos_cnt, softFRL_neg_cnt, \
        softFRL_pos_prop, softFRL_obj_per_rule, softFRL_Ld, \
        softFRL_Ld_over_iters, softFRL_Ld_best_over_iters = \
        learn_softFRL(X_pos, X_neg, n, w, C, C1, prob_terminate, T_softFRL, lmda)

    print "softFRL learned:"
    display_softFRL(softFRL_rule, softFRL_prob, antecedent_set, softFRL_pos_cnt,
                    softFRL_neg_cnt, softFRL_pos_prop, softFRL_obj_per_rule,
                    softFRL_Ld)
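# Assumed entry point (not part of the snippet above). load_data/load_labels
# come from the surrounding FRL project; the commented stubs below only sketch
# one plausible file format (whitespace-separated features, one integer label
# per line) so the snippet can be read standalone -- the real loaders may differ.
#
# def load_data(fname):
#     with open(fname) as f:
#         return [line.split() for line in f]
#
# def load_labels(fname):
#     with open(fname) as f:
#         return [int(line) for line in f]

if __name__ == "__main__":
    main()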
def main(argv):
    # test example generation
    from data import Token, ConllLoader, load_labels
    from label import Iob2TokenLabeler, LabelEncoder
    from transformers import AutoConfig, AutoTokenizer

    options = argparser().parse_args(argv[1:])
    seq_len = options.max_seq_length

    word_labels = load_labels(options.labels)
    token_labeler = Iob2TokenLabeler(word_labels)    # TODO add argument
    token_labels = token_labeler.labels()
    label_func = token_labeler.label_tokens
    label_encoder = LabelEncoder(token_labels, padding_label='O')    # TODO
    encode_labels = label_encoder.encode

    config = AutoConfig.from_pretrained(options.model_name,
                                        cache_dir=options.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(options.model_name,
                                              config=config,
                                              cache_dir=options.cache_dir)
    tokenize_func = tokenizer.tokenize
    encode_tokens = lambda t: tokenizer.encode(t, add_special_tokens=False)

    document_loader = ConllLoader(tokenize_func, label_func)
    example_generator = WrapSentenceExampleGenerator(
        seq_len,
        Token(tokenizer.cls_token, is_special=True, masked=False),
        Token(tokenizer.sep_token, is_special=True, masked=False),
        Token(tokenizer.pad_token, is_special=True, masked=True),
        encode_tokens,
        encode_labels)

    for fn in options.conll_data:
        documents = list(document_loader.load(fn))
        examples = list(example_generator.examples(documents))
        for i, example in enumerate(examples):
            print(f'example {i}')
            print(example)
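# Assumed entry point (not shown in the snippet): main(argv) expects the full
# sys.argv-style list, so sys.argv is passed through unchanged.
if __name__ == '__main__':
    import sys
    main(sys.argv)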
path_train = '/home/barbara/Documents/Trabalho/train-jpg/'
new_path_train = '/home/barbara/Documents/Trabalho/train-jpg_resized/'
path_label = '/home/barbara/Documents/Trabalho/train_v2.csv'
path_tsne = 'tsne10'

#Resizing images 32x32
#li.resize_images(path_train, new_path_train, 32)

#Loading images
start_time = time.time()
data = dat.load_images(new_path_train)
print 'Loaded in ' + str(time.time()-start_time) + 's'

# Preprocessing
data = data.astype('float32')
data /= 255.
data = pp.st_scale(data)
data = pp.normalize_l2(data)
data, i = pp.PCA_reduction(data, 0, 10)

#Loading labels
label = dat.load_labels(path_label)

#Generate t-SNE
start_time = time.time()
t.generate_tsne(path_tsne, data, label)
print 'Generated in ' + str(time.time()-start_time) + 's'

#Generate a histogram of labels (?)
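#A possible way to fill in the histogram step (sketch only, not from the
#original code): assumes `label` is an iterable of per-image tag strings and
#that matplotlib is installed.
#from collections import Counter
#import matplotlib.pyplot as plt
#counts = Counter(tag for tags in label for tag in str(tags).split())
#plt.bar(range(len(counts)), list(counts.values()))
#plt.xticks(range(len(counts)), list(counts.keys()), rotation=90)
#plt.tight_layout()
#plt.savefig('label_histogram.png')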
# limit tensorflow memory usage
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
set_session(tf.Session(config=config))

# assumed Keras imports for load_model and sequence.pad_sequences below
# (the excerpt omits its own imports)
from keras.models import load_model
from keras.preprocessing import sequence

# some hyperparameters (must conform to those in cnn.py)
batch_size = 16
maxlen = 400

# file with only texts, no labels
evaluation_test_file = "../data/C.txt"
predictions_file = evaluation_test_file + ".fulltrain.pred"
probabilities_file = evaluation_test_file + ".fulltrain.prob"
model_file = "cnn_model_gpu_multifilter_fulltrain.hdf5"

idx2label = load_labels(labels_file)

model = load_model(model_file)

X_test = load_test_file(evaluation_test_file, alphabet)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

probabilities = model.predict(X_test, batch_size=batch_size)
predictions = probabilities.argmax(axis=-1)

write_predictions_to_file(evaluation_test_file, predictions_file, predictions,
                          idx2label)
write_probabilities_to_file(evaluation_test_file, probabilities_file,
                            probabilities)
def main(argv):
    options = argparser().parse_args(argv[1:])
    logger.info(f'train.py arguments: {options}')

    # word_labels are the labels assigned to words in the original
    # data, token_labeler.labels() the labels assigned to tokens in
    # the tokenized data. The two are differentiated to allow distinct
    # labels to be added e.g. to continuation wordpieces.
    word_labels = load_labels(options.labels)
    token_labeler = IobesTokenLabeler(word_labels)
    num_labels = len(token_labeler.labels())
    label_encoder = LabelEncoder(token_labeler.labels())
    logger.info(f'token labels: {token_labeler.labels()}')

    logger.info('loading pretrained model')
    pretrained_model, tokenizer, config = load_pretrained(
        options.model_name, cache_dir=options.cache_dir)
    logger.info('pretrained model config:')
    logger.info(config)

    if options.max_seq_length > config.max_position_embeddings:
        raise ValueError(f'--max_seq_length {options.max_seq_length} not '
                         f'supported by model')
    seq_len = options.max_seq_length

    encode_tokens = lambda t: tokenizer.encode(t, add_special_tokens=False)

    document_loader = ConllLoader(tokenizer.tokenize,
                                  token_labeler.label_tokens,
                                  options.separator)

    example_generator = EXAMPLE_GENERATORS[options.examples](
        seq_len,
        Token(tokenizer.cls_token, is_special=True, masked=False),
        Token(tokenizer.sep_token, is_special=True, masked=False),
        Token(tokenizer.pad_token, is_special=True, masked=True),
        encode_tokens,
        label_encoder.encode)

    train_documents = document_loader.load(options.train_data)
    dev_documents = document_loader.load(options.dev_data)
    # containers instead of generators for statistics
    train_documents = list(train_documents)
    dev_documents = list(dev_documents)
    log_dataset_statistics('train', train_documents)
    log_dataset_statistics('dev', dev_documents)

    decoder = ViterbiDecoder(label_encoder.label_map)
    decoder.estimate_probabilities(train_documents)
    logger.info(f'init_prob:\n{decoder.init_prob}')
    logger.info(f'trans_prob:\n{decoder.trans_prob}')

    train_examples = example_generator.examples(train_documents)
    dev_examples = example_generator.examples(dev_documents)
    # containers instead of generators for len() and logging
    train_examples = list(train_examples)
    dev_examples = list(dev_examples)
    num_train_examples = len(train_examples)
    log_examples(train_examples, count=2)

    train_x, train_y = examples_to_inputs(train_examples)
    dev_x, dev_y = examples_to_inputs(dev_examples)

    ner_model = build_ner_model(pretrained_model, num_labels, seq_len)

    optimizer, lr_schedule = get_optimizer(
        options.lr,
        options.num_train_epochs,
        options.batch_size,
        options.warmup_proportion,
        num_train_examples,
    )

    ner_model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        sample_weight_mode='temporal',    # TODO is this necessary?
        metrics=['sparse_categorical_accuracy'])
    logger.info('ner model:')
    ner_model.summary(print_fn=logger.info)

    lr_history = LRHistory(lr_schedule)
    history = ner_model.fit(
        train_x,
        train_y,
        epochs=options.num_train_epochs,
        batch_size=options.batch_size,
        validation_data=(dev_x, dev_y),
        callbacks=[lr_history])
    for k, v in history.history.items():
        logger.info(f'{k} history: {v}')
    logger.info(f'lr history: {lr_history.by_epoch}')

    dev_predictions = ner_model.predict(dev_x, verbose=1,
                                        batch_size=options.batch_size)

    assert len(dev_examples) == len(dev_predictions)
    for example, preds in zip(dev_examples, dev_predictions):
        assert len(example.tokens) == len(preds)
        for pos, (token, pred) in enumerate(zip(example.tokens, preds)):
            token.predictions.append((pos, pred))

    documents = unique(t.document for e in dev_examples
                       for t in e.tokens if not t.is_special)

    check_predictions(documents)

    for n, r in evaluate_assign_labels_funcs(documents, label_encoder).items():
        print(f'{n}: prec {r.prec:.2%} rec {r.rec:.2%} f {r.fscore:.2%}')

    summarize_predictions = PREDICTION_SUMMARIZERS[options.summarize_preds]
    assign_labels = LABEL_ASSIGNERS[options.assign_labels]

    for document in documents:
        summarize_predictions(document)
        assign_labels(document, label_encoder)

    for n, r in evaluate_viterbi(documents, decoder.init_prob,
                                 decoder.trans_prob, label_encoder).items():
        print(f'{n}: prec {r.prec:.2%} rec {r.rec:.2%} f {r.fscore:.2%}')

    for document in documents:
        assign_labels(document, label_encoder)    # greedy

    print(conlleval_report(documents))

    if options.output_file is not None:
        with open(options.output_file, 'w') as out:
            write_conll(documents, out=out)

    if options.ner_model_dir is not None:
        save_ner_model(options.ner_model_dir, ner_model, decoder, tokenizer,
                       word_labels, config)

    return 0
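# Assumed entry point (not shown in the excerpt); main() returns 0 on success,
# so its return value can be used directly as the process exit status.
if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))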
checkpointer = ModelCheckpoint(filepath=model_file, verbose=1,
                               save_best_only=True)
tensorboard = TensorBoard(
    log_dir="./logs-multifilter-large-morehidden-moredrop-task1",
    write_graph=False)

model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          validation_data=(X_test, Y_test),
          callbacks=[stopping, checkpointer, tensorboard])

probabilities = model.predict(X_test, batch_size=batch_size)
predictions = probabilities.argmax(axis=-1)

idx2label = load_labels(task1_labels_file)

#with open('cnn_predictions.txt', 'w') as g:
#    for i in xrange(len(y_test)):
#        g.write(' '.join([str(v) for v in X_test[i]]) + '\t' + idx2label.get(y_test[i], 'ERROR') + '\t' + idx2label.get(predictions[i], 'ERROR') + '\n')

print('Performance of final model (not necessarily best model):')
print('========================================================')

cm = confusion_matrix(y_test, predictions)
print('Confusion matrix:')
print(cm)

acc = accuracy_score(y_test, predictions)
print('Accuracy score:')
print(acc)

labels = [label for (idx, label) in sorted(idx2label.items())]
score_report = classification_report(y_test, predictions, target_names=labels)
print('Score report:')
print(score_report)
if arg in ("-w", "--write"):
    arg_write = True

#initialize objects for further use
model = model.Model()
data = data.Data()

x_train, y_train, x_test, y_test = data.get_dataset(params.DATASET_PATH,
                                                    train=False, one_hot=True)
n_objects_train, n_objects_test = data.get_amount_objects_per_category()

if arg_multi:
    x_train, y_train, x_test, y_test = data.single_to_multi_view(
        x_train, y_train, x_test, y_test, params.N_VIEWS)

labels = data.load_labels()

_, _, _, group_ids, _, correct_predictions, _ = model.predict(
    x_test, y_test, get_saliency=False, get_activations=False)

#split correct_predictions into whether each classification is correct and the predicted label id
is_correct = correct_predictions[:, 0]
correct_label_ids = correct_predictions[:, 1]

if params.DATASET_IS_SINGLELABEL:
    #if the write argument is given, create a file and write the results
    if arg_write:
        path = os.path.join(params.RESULTS_PATH, "models",
                            os.path.basename(params.CKPT_PATH))
        f = open(os.path.join(path, "stats.txt"), "w")
        f.write("Overall Accuracy: {:.3f}\n".format(np.mean(is_correct)))
import os

import data

filename = 'trainLabels.csv'

# ROOT_PATH is the path of your python project.
ROOT_PATH = "/Users/joverlyngaudillo/Desktop/diabetic-retinopathy"

# data_directory is the full path of your dataset
data_directory = os.path.join(ROOT_PATH, "INPUTDATA")

# Fills the image_filenames and image_classes lists by
# calling the function load_labels() from data.py
image_filenames, image_classes = data.load_labels(filename)

# Iterates over the INPUTDATA directory
for f in os.listdir(data_directory):
    # Keeps only the files in .jpeg format
    if f.endswith('.jpeg'):
        # file_names is the full path of the current image file
        file_names = os.path.join(data_directory, f)
        img = f.replace('.jpeg', '')
        # Iterates over the image_filenames list
        for name in image_filenames:
            if img == name:
                break
                                           staircase=True)

########################################
########################################
#####                              #####
#####   Load & preprocess data     #####
#####                              #####
########################################
########################################

print('Loading processed data...')
start = time.time()

data_imgs = data.load_imgs(args.imgs_feat_path)
data_text = np.array(data.load_text('pascal/train.mat'))
labels_text = labels_imgs = data.load_labels(
    'VOCdevkit/VOC2007/ImageSets/Main', 5011, 20)

valid_idx = (np.sum(data_text, axis=1) != np.zeros(data_text.shape[0]))

imgs_scaler = preprocessing.StandardScaler()
text_scaler = preprocessing.StandardScaler()
data_imgs = imgs_scaler.fit_transform(data_imgs[valid_idx])
data_text = text_scaler.fit_transform(data_text[valid_idx])
labels_text = labels_imgs = labels_text[valid_idx]

IMG_SIZE = len(data_imgs[0])
TEXT_SIZE = len(data_text[0])
LABEL_SIZE = len(labels_text[0])

data = list(data_imgs) + list(data_text)