Example no. 1
def main():
    dataset = "bank-full"
    path = "datasets/" + dataset + "/all/"
    fname_data = path + dataset + "_all.X"
    fname_label = path + dataset + "_all.Y"
    
    data = load_data(fname_data)
    Y = load_labels(fname_label)
    
    # mine rules
    print "mining rules using FP-growth"
    minsupport = 10
    max_predicates_per_ant = 2
    X_pos,X_neg,nantecedents,antecedent_len,antecedent_set = \
        mine_antecedents(data,Y,minsupport,max_predicates_per_ant)
    
    n = len(data)
    
    # learn a falling rule list from the training data
    # set the parameters of Algorithm FRL
    w = 7
    C = 0.000001
    prob_terminate = 0.01
    T_FRL = 3000
    
    # set the parameters of Algorithm softFRL
    C1 = 0.5
    T_softFRL = 6000
    
    # set the parameter of the curiosity function
    lmda = 0.8
    
    # train a falling rule list
    print "running algorithm FRL on bank-full"
    FRL_rule, FRL_prob, FRL_pos_cnt, FRL_neg_cnt, FRL_obj_per_rule, FRL_Ld, \
        FRL_Ld_over_iters, FRL_Ld_best_over_iters = \
        learn_FRL(X_pos, X_neg, n, w, C, prob_terminate, T_FRL, lmda)
    
    print "FRL learned:"
    display_rule_list(FRL_rule, FRL_prob, antecedent_set, FRL_pos_cnt, FRL_neg_cnt,
                      FRL_obj_per_rule, FRL_Ld)
    
    print "running algorithm softFRL on bank-full"
    softFRL_rule, softFRL_prob, softFRL_pos_cnt, softFRL_neg_cnt, \
        softFRL_pos_prop, softFRL_obj_per_rule, softFRL_Ld, \
        softFRL_Ld_over_iters, softFRL_Ld_best_over_iters = \
        learn_softFRL(X_pos, X_neg, n, w, C, C1, prob_terminate,
                      T_softFRL, lmda)
    
    print "softFRL learned:"    
    display_softFRL(softFRL_rule, softFRL_prob, antecedent_set,
                    softFRL_pos_cnt, softFRL_neg_cnt, softFRL_pos_prop,
                    softFRL_obj_per_rule, softFRL_Ld)             
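
The helpers load_data and load_labels are not shown in this excerpt. A minimal sketch of what they might look like, assuming the .X file holds one whitespace-separated feature row per line and the .Y file one binary label per line (both assumptions about the FRL data format):

# Hypothetical loaders; the actual FRL data format may differ.
def load_data(fname):
    # one example per line, whitespace-separated categorical features
    with open(fname) as f:
        return [line.split() for line in f if line.strip()]

def load_labels(fname):
    # one 0/1 label per line
    with open(fname) as f:
        return [int(line) for line in f if line.strip()]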
Example no. 2
def main(argv):
    # test example generation
    from data import Token, ConllLoader, load_labels
    from label import Iob2TokenLabeler, LabelEncoder
    from transformers import AutoConfig, AutoTokenizer

    options = argparser().parse_args(argv[1:])
    seq_len = options.max_seq_length

    word_labels = load_labels(options.labels)
    token_labeler = Iob2TokenLabeler(word_labels)  # TODO add argument
    token_labels = token_labeler.labels()
    label_func = token_labeler.label_tokens
    label_encoder = LabelEncoder(token_labels, padding_label='O')  # TODO
    encode_labels = label_encoder.encode

    config = AutoConfig.from_pretrained(options.model_name,
                                        cache_dir=options.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(options.model_name,
                                              config=config,
                                              cache_dir=options.cache_dir)
    tokenize_func = tokenizer.tokenize
    encode_tokens = lambda t: tokenizer.encode(t, add_special_tokens=False)

    document_loader = ConllLoader(tokenize_func, label_func)
    example_generator = WrapSentenceExampleGenerator(
        seq_len, Token(tokenizer.cls_token, is_special=True, masked=False),
        Token(tokenizer.sep_token, is_special=True, masked=False),
        Token(tokenizer.pad_token, is_special=True, masked=True),
        encode_tokens, encode_labels)

    for fn in options.conll_data:
        documents = list(document_loader.load(fn))
        examples = list(example_generator.examples(documents))
        for i, example in enumerate(examples):
            print(f'example {i}')
            print(example)
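
Iob2TokenLabeler.label_tokens has to decide which label a continuation wordpiece gets when the tokenizer splits a labeled word. A minimal sketch of that idea, assuming continuation pieces inherit the I- variant of the word's label (an assumption, not necessarily the library's actual policy):

# Sketch only: the real Iob2TokenLabeler may treat continuation pieces differently.
def label_wordpieces(word_label, n_pieces):
    # assign IOB2 labels to the wordpieces of a single labeled word
    if word_label == 'O' or n_pieces == 1:
        return [word_label] * n_pieces
    inside = 'I-' + word_label.split('-', 1)[1]
    return [word_label] + [inside] * (n_pieces - 1)

# e.g. a B-PER word split into three pieces -> ['B-PER', 'I-PER', 'I-PER']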
Example no. 3
path_train = '/home/barbara/Documents/Trabalho/train-jpg/'
new_path_train = '/home/barbara/Documents/Trabalho/train-jpg_resized/'
path_label = '/home/barbara/Documents/Trabalho/train_v2.csv'
path_tsne = 'tsne10'

#Resizing images 32x32
#li.resize_images(path_train, new_path_train, 32)

#Loading images
start_time = time.time()
data = dat.load_images(new_path_train)
print('Loaded in ' + str(time.time()-start_time) + 's')

# Preprocessing
data = data.astype('float32')
data /= 255.  
data = pp.st_scale(data)
data = pp.normalize_l2(data)
data, i = pp.PCA_reduction(data, 0, 10)

#Loading labels
label = dat.load_labels(path_label)

#Generate t-SNE
start_time = time.time()
t.generate_tsne(path_tsne, data, label)
print('Generated in ' + str(time.time()-start_time) + 's')

#Generate a histogram of labels (?)
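
The pp module used above is not shown. A rough scikit-learn equivalent of the same preprocessing chain, assuming pp.st_scale standardizes features, pp.normalize_l2 L2-normalizes each sample, and the last argument of pp.PCA_reduction is the number of components (all assumptions):

# Approximate sklearn equivalent of the pp.* calls above (not the project's code).
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, normalize

def preprocess(data, n_components=10):
    data = data.astype('float32') / 255.0          # scale pixel values to [0, 1]
    data = StandardScaler().fit_transform(data)    # standardize features
    data = normalize(data, norm='l2')              # L2-normalize each sample
    return PCA(n_components=n_components).fit_transform(data)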
Example no. 4

# limit tensorflow memory usage
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
set_session(tf.Session(config=config))

# some hyperparameters (must conform to those in cnn.py)
batch_size = 16
maxlen = 400

# file with only texts, no labels
evaluation_test_file = "../data/C.txt"
predictions_file = evaluation_test_file + ".fulltrain.pred"
probabilities_file = evaluation_test_file + ".fulltrain.prob"
model_file = "cnn_model_gpu_multifilter_fulltrain.hdf5"
idx2label = load_labels(labels_file)

model = load_model(model_file)

X_test = load_test_file(evaluation_test_file, alphabet)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

probabilities = model.predict(X_test, batch_size=batch_size)
predictions = probabilities.argmax(axis=-1)
write_predictions_to_file(evaluation_test_file, predictions_file, predictions,
                          idx2label)
write_probabilities_to_file(evaluation_test_file, probabilities_file,
                            probabilities)
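
write_predictions_to_file and write_probabilities_to_file are not shown in this excerpt. A hypothetical stand-in for the first, assuming an output format of one "text <TAB> predicted label" line per input (the real format is not confirmed by the source):

# Hypothetical helper; the real write_predictions_to_file may use another format.
def write_predictions_to_file(in_path, out_path, predictions, idx2label):
    with open(in_path) as fin, open(out_path, 'w') as fout:
        for text, pred in zip(fin, predictions):
            fout.write(text.rstrip('\n') + '\t' + idx2label.get(int(pred), 'UNK') + '\n')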
Example no. 5
def main(argv):
    options = argparser().parse_args(argv[1:])
    logger.info(f'train.py arguments: {options}')

    # word_labels are the labels assigned to words in the original
    # data, token_labeler.labels() the labels assigned to tokens in
    # the tokenized data. The two are differentiated to allow distinct
    # labels to be added e.g. to continuation wordpieces.
    word_labels = load_labels(options.labels)
    token_labeler = IobesTokenLabeler(word_labels)
    num_labels = len(token_labeler.labels())
    label_encoder = LabelEncoder(token_labeler.labels())
    logger.info(f'token labels: {token_labeler.labels()}')

    logger.info('loading pretrained model')
    pretrained_model, tokenizer, config = load_pretrained(
        options.model_name, cache_dir=options.cache_dir)
    logger.info('pretrained model config:')
    logger.info(config)

    if options.max_seq_length > config.max_position_embeddings:
        raise ValueError(f'--max_seq_length {options.max_seq_length} not '
                         f'supported by model')
    seq_len = options.max_seq_length

    encode_tokens = lambda t: tokenizer.encode(t, add_special_tokens=False)

    document_loader = ConllLoader(tokenizer.tokenize,
                                  token_labeler.label_tokens,
                                  options.separator)

    example_generator = EXAMPLE_GENERATORS[options.examples](
        seq_len, Token(tokenizer.cls_token, is_special=True, masked=False),
        Token(tokenizer.sep_token, is_special=True, masked=False),
        Token(tokenizer.pad_token, is_special=True,
              masked=True), encode_tokens, label_encoder.encode)

    train_documents = document_loader.load(options.train_data)
    dev_documents = document_loader.load(options.dev_data)
    # containers instead of generators for statistics
    train_documents = list(train_documents)
    dev_documents = list(dev_documents)
    log_dataset_statistics('train', train_documents)
    log_dataset_statistics('dev', dev_documents)

    decoder = ViterbiDecoder(label_encoder.label_map)
    decoder.estimate_probabilities(train_documents)
    logger.info(f'init_prob:\n{decoder.init_prob}')
    logger.info(f'trans_prob:\n{decoder.trans_prob}')

    train_examples = example_generator.examples(train_documents)
    dev_examples = example_generator.examples(dev_documents)
    # containers instead of generators for len() and logging
    train_examples = list(train_examples)
    dev_examples = list(dev_examples)
    num_train_examples = len(train_examples)
    log_examples(train_examples, count=2)

    train_x, train_y = examples_to_inputs(train_examples)
    dev_x, dev_y = examples_to_inputs(dev_examples)

    ner_model = build_ner_model(pretrained_model, num_labels, seq_len)

    optimizer, lr_schedule = get_optimizer(
        options.lr,
        options.num_train_epochs,
        options.batch_size,
        options.warmup_proportion,
        num_train_examples,
    )

    ner_model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        sample_weight_mode='temporal',  # TODO is this necessary?
        metrics=['sparse_categorical_accuracy'])
    logger.info('ner model:')
    ner_model.summary(print_fn=logger.info)

    lr_history = LRHistory(lr_schedule)
    history = ner_model.fit(train_x,
                            train_y,
                            epochs=options.num_train_epochs,
                            batch_size=options.batch_size,
                            validation_data=(dev_x, dev_y),
                            callbacks=[lr_history])
    for k, v in history.history.items():
        logger.info(f'{k} history: {v}')
    logger.info(f'lr history: {lr_history.by_epoch}')

    dev_predictions = ner_model.predict(dev_x,
                                        verbose=1,
                                        batch_size=options.batch_size)
    assert len(dev_examples) == len(dev_predictions)
    for example, preds in zip(dev_examples, dev_predictions):
        assert len(example.tokens) == len(preds)
        for pos, (token, pred) in enumerate(zip(example.tokens, preds)):
            token.predictions.append((pos, pred))

    documents = unique(t.document for e in dev_examples for t in e.tokens
                       if not t.is_special)
    check_predictions(documents)

    for n, r in evaluate_assign_labels_funcs(documents, label_encoder).items():
        print(f'{n}: prec {r.prec:.2%} rec {r.rec:.2%} f {r.fscore:.2%}')

    summarize_predictions = PREDICTION_SUMMARIZERS[options.summarize_preds]
    assign_labels = LABEL_ASSIGNERS[options.assign_labels]
    for document in documents:
        summarize_predictions(document)
        assign_labels(document, label_encoder)

    for n, r in evaluate_viterbi(documents, decoder.init_prob,
                                 decoder.trans_prob, label_encoder).items():
        print(f'{n}: prec {r.prec:.2%} rec {r.rec:.2%} f {r.fscore:.2%}')

    for document in documents:
        assign_labels(document, label_encoder)  # greedy

    print(conlleval_report(documents))

    if options.output_file is not None:
        with open(options.output_file, 'w') as out:
            write_conll(documents, out=out)

    if options.ner_model_dir is not None:
        save_ner_model(options.ner_model_dir, ner_model, decoder, tokenizer,
                       word_labels, config)

    return 0
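
The ViterbiDecoder above estimates initial and transition probabilities from the training documents and later decodes over the per-token prediction scores. A rough illustration of how such probabilities could be estimated by counting label bigrams (a sketch of the idea, not the ViterbiDecoder implementation):

# Counting-based estimate of init/transition probabilities over integer label ids
# (illustrative only; not the ViterbiDecoder code used above).
import numpy as np

def estimate_probs(label_seqs, num_labels):
    init = np.ones(num_labels)                    # add-one smoothing
    trans = np.ones((num_labels, num_labels))
    for seq in label_seqs:
        if not seq:
            continue
        init[seq[0]] += 1
        for prev, curr in zip(seq, seq[1:]):
            trans[prev, curr] += 1
    return init / init.sum(), trans / trans.sum(axis=1, keepdims=True)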
Example no. 6

checkpointer = ModelCheckpoint(filepath=model_file,
                               verbose=1,
                               save_best_only=True)
tensorboard = TensorBoard(
    log_dir="./logs-multifilter-large-morehidden-moredrop-task1",
    write_graph=False)
model.fit(X_train,
          Y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          validation_data=(X_test, Y_test),
          callbacks=[stopping, checkpointer, tensorboard])

probabilities = model.predict(X_test, batch_size=batch_size)
predictions = probabilities.argmax(axis=-1)
idx2label = load_labels(task1_labels_file)
#with open('cnn_predictions.txt', 'w') as g:
#    for i in xrange(len(y_test)):
#        g.write(' '.join([str(v) for v in X_test[i]]) + '\t' + idx2label.get(y_test[i], 'ERROR') + '\t' + idx2label.get(predictions[i], 'ERROR') + '\n')
print('Performance of final model (not necessarily best model):')
print('========================================================')
cm = confusion_matrix(y_test, predictions)
print('Confusion matrix:')
print(cm)
acc = accuracy_score(y_test, predictions)
print('Accuracy score:')
print(acc)
labels = [label for (idx, label) in sorted(idx2label.items())]
score_report = classification_report(y_test, predictions, target_names=labels)
print('Score report:')
print(score_report)
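
One caveat with the report above: target_names is matched positionally against the classes scikit-learn infers from y_test and predictions, so the names can misalign if a class is absent. Passing labels explicitly avoids this (a suggested variant, not the original code):

# Suggested variant: pin the class order so target_names always line up.
label_ids = sorted(idx2label)
score_report = classification_report(y_test, predictions,
                                     labels=label_ids,
                                     target_names=[idx2label[i] for i in label_ids])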
Example no. 7
        if arg in ("-w", "--write"):
            arg_write = True

    #initialize objects for further use
    model = model.Model()
    data = data.Data()

    x_train, y_train, x_test, y_test = data.get_dataset(params.DATASET_PATH,
                                                        train=False,
                                                        one_hot=True)
    n_objects_train, n_objects_test = data.get_amount_objects_per_category()

    if arg_multi:
        x_train, y_train, x_test, y_test = data.single_to_multi_view(
            x_train, y_train, x_test, y_test, params.N_VIEWS)
    labels = data.load_labels()

    _, _, _, group_ids, _, correct_predictions, _ = model.predict(
        x_test, y_test, get_saliency=False, get_activations=False)

    #split correct_predictions into whether each classification is correct and the predicted label id
    is_correct = correct_predictions[:, 0]
    correct_label_ids = correct_predictions[:, 1]

    if params.DATASET_IS_SINGLELABEL:
        #if the write argument is given, create a stats file and write the results
        if arg_write:
            path = os.path.join(params.RESULTS_PATH, "models",
                                os.path.basename(params.CKPT_PATH))
            f = open(os.path.join(path, "stats.txt"), "w")
            f.write("Overall Accuracy: {:.3f}\n".format(np.mean(is_correct)))
Example no. 8
import os

import data

filename = 'trainLabels.csv'

# ROOT_PATH is the path of your python project.
ROOT_PATH = "/Users/joverlyngaudillo/Desktop/diabetic-retinopathy"

# data_directory is the full path of your dataset
data_directory = os.path.join(ROOT_PATH, "INPUTDATA")

# Fills the image_filenames and image_classes lists by
# calling the function load_labels() from data.py
image_filenames, image_classes = data.load_labels(filename)

# Iterates over the INPUTDATA directory
for f in os.listdir(data_directory):
    # Filters the contents of the data_directory, keeping only
    # files with the .jpeg extension
    if f.endswith('.jpeg'):

        # file_names is the full path of the current .jpeg file
        file_names = os.path.join(data_directory, f)

        img = f.replace('.jpeg', '')

        # Iterates over the image_filenames list
        for name in image_filenames:
            if img == name:
                break
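
The inner loop above scans image_filenames once for every .jpeg file, which is quadratic in the number of images. A set gives the same membership check in constant time (a suggested alternative, not the original code):

# Suggested alternative: O(1) membership test instead of the inner loop above.
known_names = set(image_filenames)
for f in os.listdir(data_directory):
    if f.endswith('.jpeg') and f.replace('.jpeg', '') in known_names:
        file_path = os.path.join(data_directory, f)
        # ... handle the matched image here ...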
Example no. 9
                                           staircase=True)

########################################
########################################
#####                              #####
#####    Load & preprocess data    #####
#####                              #####
########################################
########################################

print('Loading processed data...')
start = time.time()

data_imgs = data.load_imgs(args.imgs_feat_path)
data_text = np.array(data.load_text('pascal/train.mat'))
labels_text = labels_imgs = data.load_labels(
    'VOCdevkit/VOC2007/ImageSets/Main', 5011, 20)

valid_idx = np.sum(data_text, axis=1) != 0  # keep samples with at least one nonzero text feature

imgs_scaler = preprocessing.StandardScaler()
text_scaler = preprocessing.StandardScaler()

data_imgs = imgs_scaler.fit_transform(data_imgs[valid_idx])
data_text = text_scaler.fit_transform(data_text[valid_idx])
labels_text = labels_imgs = labels_text[valid_idx]

IMG_SIZE = len(data_imgs[0])
TEXT_SIZE = len(data_text[0])
LABEL_SIZE = len(labels_text[0])

data = list(data_imgs) + list(data_text)