Example #1
def generate_folds(iteration,current_attributes):
	#used when multiple repetitions are required
	
	#current_attributes is used to store information (if pre_processing > 0)
	
	for fold in range(0,n_folds):
		subprocess.call ("rm " + path_results+"it"+str(actual_iteration)+"/TrainFold"+str(fold), shell = True)
		subprocess.call ("rm " + path_results+"it"+str(actual_iteration)+"/TestFold"+str(fold), shell = True)
	if (loocv):
		validation.loocv_folds(path_results+"it"+str(actual_iteration)+"/"+name+"_it"+str(actual_iteration)+".arff",path_results+"it"+str(actual_iteration))
	else:
		if pre_processing > 0:
			#re-store all the information about the CURRENT AVAILABLE ATTRIBUTES (different than the original/starting set!)
			classes[:] = []
			data_matrix[:] = []
			data_labels[:] = []
			attributes_type.clear()
			attribute_indexes.clear()
			binarised_attribute_mapping.clear()
			isCategorical.clear()
			attribute_definitions[:] = []
			relation_name, class_name = datasetParser.get_attributes_info(path_results+"it"+str(actual_iteration)+"/"+name+"_it"+str(actual_iteration)+".arff", attribute_definitions, attributes_type, isCategorical, attribute_indexes)
		
		validation.cvs_folds(path_results+"it"+str(actual_iteration)+"/"+name+"_it"+str(actual_iteration)+".arff",path_results+"it"+str(actual_iteration), categorical_attributes, binarised_attribute_mapping, n_folds, db_scv)
		if  pre_processing > 0:
			#pre-process the folds if necessary
			preprocessing.preprocess_dataset(pre_processing,path_results, actual_iteration, dataset_name, n_folds, binarised_attribute_mapping, isCategorical, relation_name)
			if iteration != 0:
				validation.fix_new_folds (iteration, current_attributes, path_results, n_folds)
	return	
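Note that generate_folds relies on module-level state (path_results, actual_iteration, name, n_folds, loocv, pre_processing and the attribute containers) set up elsewhere in the script; Example #21 below shows part of that setup. The preprocessing.preprocess_dataset helper itself is not included in the snippet. A minimal sketch of the per-fold contract it is assumed to satisfy, namely rewriting the TrainFold*/TestFold* files produced for the current iteration, could look like this (hypothetical code, not the project's implementation):

import os

def preprocess_folds_sketch(path_results, iteration, n_folds, transform):
    # Apply `transform` to every fold file written under path_results/it<iteration>/.
    fold_dir = os.path.join(path_results, "it" + str(iteration))
    for fold in range(n_folds):
        for prefix in ("TrainFold", "TestFold"):
            fold_file = os.path.join(fold_dir, prefix + str(fold))
            if not os.path.exists(fold_file):
                continue
            with open(fold_file) as f:
                lines = f.readlines()
            with open(fold_file, "w") as f:
                f.writelines(transform(lines))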
Example #2
def predict(model, dataset, training_params, flip, verbose=True):
    # New epoch
    dataset.on_new_epoch()
    N = training_params.Ntest // training_params.test_batch_size
    if N==0:
        raise Exception("Ntest = 0.")
    for i in range(N):
        if verbose:
            print("\rBatch %d over %d" % (i, N), end="")
        # Get next batch
        batch, targets = dataset.get_batch()
        # Optionally flip the batch horizontally
        if flip:
            batch = np.fliplr(batch.transpose(1,2,3,0)).transpose(3,0,1,2)
        # Preprocess
        for mode in training_params.valid_preprocessing:
            batch = preprocess_dataset(batch, training_params, mode)
        # Predict
        if type(model) is Graph:
            pred = model.predict({"input":np.array(batch.transpose(0,3,1,2), "float32")})["output"]
        else:
            pred = model.predict(np.array(batch.transpose(0,3,1,2), "float32"))
        # Accumulate preds
        if i==0:
            predictions = np.copy(pred)
            labels = np.copy(convert_labels(targets))
        else:
            predictions = np.concatenate((predictions,pred))
            labels = np.concatenate((labels,convert_labels(targets)))

    return predictions, labels
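In this example (and in Examples #3, #6 and #15 below) preprocess_dataset(batch, training_params, mode) is applied once per mode listed in training_params.valid_preprocessing. The helper itself is not part of the snippet; a minimal per-mode dispatcher consistent with that call pattern might look like the sketch below, where the mode names and constants are assumptions rather than the original code:

import numpy as np

def preprocess_dataset(batch, training_params, mode):
    # Hypothetical sketch: `batch` is assumed to be an array of shape (N, H, W, C).
    batch = batch.astype("float32")
    if mode == "scale":
        batch /= 255.0  # map raw pixel values to [0, 1]
    elif mode == "center":
        batch -= batch.mean(axis=(1, 2, 3), keepdims=True)  # per-image mean removal
    elif mode == "std":
        batch /= batch.std(axis=(1, 2, 3), keepdims=True) + 1e-8
    return batch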
Example #3
def get_features(model, dataset, position, N, training_params, verbose, flip=False):

    intermediate_outputs = K.function([model.layers[0].input], [model.layers[position].get_output(train=False)])

    if N==0:
        raise Exception("Ntest = 0.")
    for i in range(N):
        if verbose:
            print("\rBatch %d over %d" % (i, N), end="")
        # Get next batch
        batch, targets = dataset.get_batch()
        # Optionally flip the batch horizontally
        if flip:
            batch = np.fliplr(batch.transpose(1,2,3,0)).transpose(3,0,1,2)
        # Preprocess
        for mode in training_params.valid_preprocessing:
            batch = preprocess_dataset(batch, training_params, mode)
        # Predict
        pred = intermediate_outputs([np.array(batch.transpose(0,3,1,2), "float32")])[0]
        if pred.shape[1] != 256 and pred.shape[1] != 512:
            raise Exception("Not the expected layer. Dim = %d" % pred.shape[1])
        # Accumulate preds
        if i==0:
            predictions = np.copy(pred)
            labels = np.copy(convert_labels(targets))
        else:
            predictions = np.concatenate((predictions,pred))
            labels = np.concatenate((labels,convert_labels(targets)))

    return predictions, labels
Example #4
def face_recognition(dataset):
    dataset = prep.preprocess_dataset(dataset)
    faces=list(map(fd.detect_face,dataset))
    N = int(np.max([i.shape[0] for i in faces if len(np.unique(i))!=1]))
    M = int(np.max([i.shape[1] for i in faces if len(np.unique(i))!=1]))
    standardise = list(map(partial(featd.standardize_border,N=N,M=M), faces))
    return standardise
Example #5
def train_w2v(dict):

    dict = index_dataset()

    if os.path.exists('./objects/corpus_token_list.pickle'):
        token_list = load_object('./objects/corpus_token_list.pickle')
    else:
        # Preprocess dataset if needed
        dataset, corpus = preprocess_dataset(dict,
                                             lemmatize=True,
                                             remove_stopwords=True,
                                             measure_time=True)

        token_list = []
        for dataset_entry in dataset:
            token_list.append(list(dataset[dataset_entry][0].keys()))

        save_object(token_list, './objects/corpus_token_list.pickle')

    print(token_list[:5])

    # start = time.time()
    model = Word2Vec(token_list, size=100, window=3, min_count=1, workers=4)
    # print(time.time() - start)

    # print(model.most_similar('plan', topn=10))

    # print(model.wv['insurance'])

    return model
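A side note on the Word2Vec call: size=100 is the gensim 3.x keyword; gensim 4.0 renamed it to vector_size (and iter to epochs). An equivalent call under gensim >= 4.0 would be:

from gensim.models import Word2Vec

token_list = [["insurance", "plan", "premium"], ["claim", "policy", "coverage"]]  # toy stand-in
model = Word2Vec(token_list, vector_size=100, window=3, min_count=1, workers=4)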
Example #6
def predict(model, dataset, training_params, flip, verbose=True):
    # New epoch
    dataset.on_new_epoch()
    N = training_params.Ntest // training_params.test_batch_size
    if N == 0:
        raise Exception("Ntest = 0.")
    for i in range(N):
        if verbose:
            print("\rBatch %d over %d" % (i, N), end="")
        # Get next batch
        batch, targets = dataset.get_batch()
        # Optionally flip the batch horizontally
        if flip:
            batch = np.fliplr(batch.transpose(1, 2, 3,
                                              0)).transpose(3, 0, 1, 2)
        # Preprocess
        for mode in training_params.valid_preprocessing:
            batch = preprocess_dataset(batch, training_params, mode)
        # Predict
        if type(model) is Graph:
            pred = model.predict(
                {"input": np.array(batch.transpose(0, 3, 1, 2),
                                   "float32")})["output"]
        else:
            pred = model.predict(
                np.array(batch.transpose(0, 3, 1, 2), "float32"))
        # Accumulate preds
        if i == 0:
            predictions = np.copy(pred)
            labels = np.copy(convert_labels(targets))
        else:
            predictions = np.concatenate((predictions, pred))
            labels = np.concatenate((labels, convert_labels(targets)))

    return predictions, labels
Example #7
def main():
    args = sys.argv

    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/cnn_model.h5'
    num_words = 40000
    num_label = 2

    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    emb_flg = args[1] if len(args) > 1 else ''  # sys.argv[0] is the script path, not the flag
    if emb_flg == 't':
        wv = load_fasttext('../chap08/models/cc.ja.300.vec.gz')
        wv = filter_embeddings(wv, vocab.word_index, num_words)
    else:
        wv = None

    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)

    print('precision: {:.4f}'.format(
        precision_score(y_test, y_pred, average='binary')))
    print('recall   : {:.4f}'.format(
        recall_score(y_test, y_pred, average='binary')))
    print('f1   : {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
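build_vocabulary, CNNModel, InferenceAPI and load_fasttext are project-specific helpers that are not shown here. For orientation, the preprocess -> vocabulary -> pad part of the pipeline can be reproduced with plain tf.keras utilities roughly as follows; the whitespace-only preprocess_dataset stand-in and all constants are illustrative, not the original helpers:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_dataset(texts):
    # Stand-in only: the real helper presumably performs Japanese tokenization (e.g. MeCab).
    return [t.lower().strip() for t in texts]

texts = preprocess_dataset(["This item was great", "Terrible, would not buy again"])
vocab = Tokenizer(num_words=40000, oov_token="<UNK>")  # plays the role of build_vocabulary
vocab.fit_on_texts(texts)
seqs = pad_sequences(vocab.texts_to_sequences(texts), maxlen=300, truncating="post")
print(seqs.shape)  # (2, 300)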
Example #8
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # model_path = 'models/unidirectional_model.h5'
    model_path = 'models/'
    pretrained_model_name_or_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    maxlen = 250

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')
    # model = BertModel.from_pretrained (pretrained_model_name_or_path)
    # config =  BertConfig(pretrained_model_name_or_path)
    tokenizer = BertJapaneseTokenizer.from_pretrained(
        pretrained_model_name_or_path, do_word_tokenize=False)

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    target_vocab = Vocab(lower=False).fit(y_train)
    features_train, labels_train = convert_examples_to_features(
        x_train,
        y_train,
        target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)
    features_test, labels_test = convert_examples_to_features(
        x_test,
        y_test,
        target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)

    # Build the model.
    model = build_model(pretrained_model_name_or_path, target_vocab.size)
    model.compile(optimizer='sgd', loss=loss_func(target_vocab.size))

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
    ]

    # Train the model.
    model.fit(x=features_train,
              y=labels_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)

    # Prediction and evaluation.
    evaluate(model, target_vocab, features_test, labels_test)
Example #9
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/'
    pretrained_model_name_or_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    maxlen = 250

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')
    tokenizer = BertJapaneseTokenizer.from_pretrained(
        pretrained_model_name_or_path, do_word_tokenize=False)

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    target_vocab = Vocab(lower=False).fit(y_train)
    features_train, labels_train = convert_examples_to_features(
        x_train,
        y_train,
        target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)
    features_test, labels_test = convert_examples_to_features(
        x_test,
        y_test,
        target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)

    # Build model.
    model = build_model(pretrained_model_name_or_path, target_vocab.size)
    model.compile(optimizer='sgd', loss=loss_func(target_vocab.size))

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
    ]

    # Train the model.
    model.fit(x=features_train,
              y=labels_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)
    model.save_pretrained(model_path)

    # Evaluation.
    evaluate(model, target_vocab, features_test, labels_test)
Example #10
def main():
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = "cnn_model.h5"
    num_words = 40000
    num_label = 2

    x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv")

    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating="post")
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating="post")

    wv = load_fasttext("data/cc.ja.300.vec.gz")
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["acc"])

    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequence(x_test)
    print("precision: {:.4f}".format(
        precision_score(y_test, y_pred, average="binary")))
    print("recall: {:.4f}".format(
        recall_score(y_test, y_pred, average="binary")))
    print("f1: {:.4f}".format(f1_score(y_test, y_pred, average="binary")))
Example #11
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model_{}.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build models.
    models = [
        UnidirectionalModel(num_words, target_vocab.size).build(),
        BidirectionalModel(num_words, target_vocab.size).build(),
    ]
    for i, model in enumerate(models):
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(i), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(i))
        api = InferenceAPI(model, source_vocab, target_vocab)
        y_pred = api.predict_from_sequences(x_test)
        print(classification_report(y_test, y_pred, digits=4))
Example #12
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # model_path = 'models/unidirectional_model.h5'
    model_path = 'models/bidirectional_model.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build the model.
    # model = UnidirectionalModel(num_words, target_vocab.size).build()
    model = BidirectionalModel(num_words, target_vocab.size).build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)

    # Prediction and evaluation.
    model = load_model(model_path)
    api = InferenceAPI(model, source_vocab, target_vocab)
    y_pred = api.predict_from_sequences(x_test)
    print(classification_report(y_test, y_pred, digits=4))
Example #13
def train():
    df_tweets = pd.read_csv("data/df_tweets", index_col=0)
    df_tweets["text"] = preprocess_dataset(df_tweets["text"])
    df_tweets = df_tweets.dropna(how='any')
    df_tweets = df_tweets.drop(df_tweets.index[df_tweets["Irrelevant"] == 1])

    x = df_tweets["text"]
    # y = df_tweets[["posi_and_nega", "posi", "nega", "neutral", "Irrelevant"]]
    y = df_tweets[["posi_and_nega", "posi", "nega", "neutral"]]
    y = np.asarray(y)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    with open('model/tokenizer.pickle', 'wb') as handle:
        pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating="post")
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating="post")

    wv = KeyedVectors.load("model/word2vec.model", mmap='r')
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=["acc"])

    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.5,
              callbacks=callbacks,
              shuffle=True)
Example #14
def predict(text):
    wv = KeyedVectors.load("model/word2vec.model", mmap='r')
    model = load_model("model/cnn_model.h5")
    with open('tokenizer.pickle', 'rb') as handle:
        vocab = pickle.load(handle)

    # t = Tokenizer(wakati=True)
    # tokenized_text = " ".join(t.tokenize(str(text)))
    text = [text]
    tokenized_text = preprocess_dataset(text)
    text_sequences = vocab.texts_to_sequences(tokenized_text)
    text_sequences = pad_sequences(text_sequences,
                                   maxlen=maxlen,
                                   truncating="post")
    print("--------------------------------------------")
    print(text_sequences.shape)
    print("--------------------------------------------")
    print(tokenized_text)
    print("--------------------------------------------")
    return model.predict(text_sequences)
Example #15
def get_features(model,
                 dataset,
                 position,
                 N,
                 training_params,
                 verbose,
                 flip=False):

    intermediate_outputs = K.function(
        [model.layers[0].input],
        [model.layers[position].get_output(train=False)])

    if N == 0:
        raise Exception("Ntest = 0.")
    for i in range(N):
        if verbose:
            print("\rBatch %d over %d" % (i, N), end="")
        # Get next batch
        batch, targets = dataset.get_batch()
        # Optionally flip the batch horizontally
        if flip:
            batch = np.fliplr(batch.transpose(1, 2, 3,
                                              0)).transpose(3, 0, 1, 2)
        # Preprocess
        for mode in training_params.valid_preprocessing:
            batch = preprocess_dataset(batch, training_params, mode)
        # Predict
        pred = intermediate_outputs(
            [np.array(batch.transpose(0, 3, 1, 2), "float32")])[0]
        if pred.shape[1] != 256 and pred.shape[1] != 512:
            raise Exception("Not the expected layer. Dim = %d" % pred.shape[1])
        # Accumulate preds
        if i == 0:
            predictions = np.copy(pred)
            labels = np.copy(convert_labels(targets))
        else:
            predictions = np.concatenate((predictions, pred))
            labels = np.concatenate((labels, convert_labels(targets)))

    return predictions, labels
Example #16
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/attention_model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    data_path = 'data/jpn.txt'
    num_words = 10000
    num_data = 20000

    # Data loading.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Pre-processing.
    ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts,
                                                        ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)

    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Build the model.
    encoder = Encoder(num_words, return_sequences=True)
    decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path,
                        save_best_only=True,
                        save_weights_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Prediction.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    for text in texts:
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))

    # Performance evaluation.
    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
Example #17
    if len(sys.argv) != 4:
        print('Use case: {} <params file> <search term> <model write location>'.format(
            sys.argv[0]))
        quit()

    data_path = os.path.join('..', 'data')
    try:
        os.makedirs(data_path)
    except FileExistsError:
        tqdm.write('data directory exists, continuing...')

    # generate dataset for training
    scrape.generate_dataset(sys.argv[1], sys.argv[2], data_path)

    # preprocess data
    preprocessing.preprocess_dataset(os.path.join(
        data_path, sys.argv[2]), IMG_WIDTH, IMG_HEIGHT)
    preprocessing.preprocess_dataset(os.path.join(
        data_path, 'NOT-{}'.format(sys.argv[2])), IMG_WIDTH, IMG_HEIGHT)

    # define datasets
    image_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        rescale=1./255,
        validation_split=0.3,
    )

    train_generator = image_generator.flow_from_directory(directory=data_path,
                                                          target_size=(
                                                              IMG_HEIGHT, IMG_WIDTH),
                                                          classes=[
                                                              sys.argv[2], 'NOT-{}'.format(sys.argv[2])],
                                                          subset='training',
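The snippet is cut off inside the flow_from_directory call. Reusing the names defined above (image_generator, data_path, IMG_HEIGHT, IMG_WIDTH), a typical way to finish the training/validation generator pair for the same validation_split is shown below; the arguments after subset='training' were not recovered from the original, so this is only an illustrative completion:

    train_generator = image_generator.flow_from_directory(
        directory=data_path,
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        classes=[sys.argv[2], 'NOT-{}'.format(sys.argv[2])],
        class_mode='binary',
        subset='training')
    validation_generator = image_generator.flow_from_directory(
        directory=data_path,
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        classes=[sys.argv[2], 'NOT-{}'.format(sys.argv[2])],
        class_mode='binary',
        subset='validation')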
Example #18
def main():

    # Read dataset
    dict = index_dataset()

    # Preprocess dataset if needed
    if not os.path.exists('./objects/indexer.pickle') or not os.path.exists(
            './objects/knn.pickle'):
        dataset, corpus = preprocess_dataset(dict,
                                             lemmatize=True,
                                             remove_stopwords=True,
                                             measure_time=True)

    # Load or create indexer
    if os.path.exists('./objects/indexer.pickle'):
        indexer = load_object('./objects/indexer.pickle')
    else:
        indexer = Indexer(dataset, measure_time=True)
        save_object(indexer, './objects/indexer.pickle')

    #Load or create KNN
    if os.path.exists('./objects/knn.pickle'):
        knn = load_object('./objects/knn.pickle')
    else:
        # Initialize KNN with given dataset
        knn = KNN(dataset, corpus, measure_time=True)
        save_object(knn, './objects/knn.pickle')

    # Main loop for user input
    print("Type a question:")
    q = input()
    while q != 'quit':

        processed_input = preprocess_input(q,
                                           lemmatize=True,
                                           remove_stopwords=True)

        terms_to_search_for = list(processed_input.keys())

        print('Terms to search for:')
        print(terms_to_search_for)
        print()

        containing_docs = indexer.retrieve_documents(terms_to_search_for,
                                                     measure_time=True)

        res = knn.find_nearest_neigbours(processed_input,
                                         containing_docs,
                                         k=10,
                                         measure_time=True)

        print("\nResults:\n")
        i = 1
        for r in res:
            print(f'#{i}')
            print(r)
            print()
            i += 1

        print("Type a question:")
        q = input()
Example #19
    # main()

    dict = index_dataset()
    # print((corpus))
    # print(dict)
    # testing(dict)

    # Load or create indexer
    if os.path.exists('./objects/bot_nn10.pickle'):
        bot = load_object('./objects/bot_nn10.pickle')
    else:
        bot = QnABot()

        # use corpus to find typos in questions
        dataset, corpus = preprocess_dataset(dict,
                                             lemmatize=False,
                                             remove_stopwords=False,
                                             measure_time=True)

        bot.set_dataset(dict, dataset, corpus)
        save_object(bot, './objects/bot_nn10.pickle')

    q = ""
    while q != 'q':
        q = input("Your question(to quit enter q): ")

        # check for typos
        flag_typos = True
        all_incorrect = True
        # TODO
        # check whether it is just a typo or the whole sentence is gibberish; if it is only a typo, apply Levenshtein distance
        split_str = q.split(" ")
Example #20
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments,
                               UDTrainingArguments, MultiLingAdapterArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        (
            model_args,
            data_args,
            training_args,
            adapter_args,
        ) = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare for UD dependency parsing task
    labels = UD_HEAD_LABELS
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i
                  for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
        pad_token_id=-1,
    )

    if model_args.is_japanese:
        assert model_args.mecab_dir is not None
        assert model_args.mecab_dic_dir is not None

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
        do_lower_case=model_args.do_lower_case,
        add_prefix_space=True,  # Used e.g. for RoBERTa
        mecab_kwargs={
            "mecab_option":
            f"-r {model_args.mecab_dir} -d {model_args.mecab_dic_dir}"
        } if model_args.is_japanese else None,
    )

    # The task name (with prefix)
    task_name = "ud_" + data_args.task_name
    language = adapter_args.language

    model = AutoModelWithHeads.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )
    model.add_dependency_parsing_head(
        task_name,
        num_labels=num_labels,
        id2label=label_map,
    )

    if model_args.leave_out_twelvth:
        logger.info("Leaving out 12")
        leave_out = [11]
    else:
        leave_out = []

    # Setup adapters
    if adapter_args.train_adapter:
        # check if adapter already exists, otherwise add it
        if task_name not in model.config.adapters:
            # resolve the adapter config
            adapter_config = AdapterConfig.load(
                adapter_args.adapter_config,
                non_linearity=adapter_args.adapter_non_linearity,
                reduction_factor=adapter_args.adapter_reduction_factor,
                leave_out=leave_out,
            )
            # load a pre-trained from Hub if specified
            if adapter_args.load_adapter:
                model.load_adapter(
                    adapter_args.load_adapter,
                    config=adapter_config,
                    load_as=task_name,
                    leave_out=leave_out,
                )
            # otherwise, add a fresh adapter
            else:
                model.add_adapter(task_name, config=adapter_config)
        # optionally load a pre-trained language adapter
        if adapter_args.load_lang_adapter:
            # resolve the language adapter config
            lang_adapter_config = AdapterConfig.load(
                adapter_args.lang_adapter_config,
                non_linearity=adapter_args.lang_adapter_non_linearity,
                reduction_factor=adapter_args.lang_adapter_reduction_factor,
                leave_out=leave_out,
            )
            # load the language adapter from Hub
            lang_adapter_name = model.load_adapter(
                adapter_args.load_lang_adapter,
                config=lang_adapter_config,
                load_as=adapter_args.language,
                leave_out=leave_out,
            )
        else:
            lang_adapter_name = None
        # Freeze all model weights except of those of this adapter
        model.train_adapter([task_name])
        # Set the adapters to be used in every forward pass
        if lang_adapter_name:
            model.set_active_adapters(ac.Stack(lang_adapter_name, task_name))
        else:
            model.set_active_adapters(task_name)
    else:
        if adapter_args.load_adapter or adapter_args.load_lang_adapter:
            raise ValueError(
                "Adapters can only be loaded in adapters training mode. "
                "Use --train_adapter to enable adapter training")

    # Load and preprocess dataset
    dataset = load_dataset("universal_dependencies", data_args.task_name)
    dataset = preprocess_dataset(dataset,
                                 tokenizer,
                                 labels,
                                 data_args,
                                 pad_token_id=-1)

    # Initialize our Trainer
    # HACK: Set this attribute to False to prevent label columns from being deleted
    training_args.remove_unused_columns = False
    trainer_class = DependencyParsingAdapterTrainer if adapter_args.train_adapter else DependencyParsingTrainer
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        if training_args.store_best_model:
            logger.info("Loading best model for predictions.")

            if adapter_args.train_adapter:
                if language:
                    lang_adapter_config = AdapterConfig.load(
                        config="pfeiffer",
                        non_linearity="gelu",
                        reduction_factor=2,
                        leave_out=leave_out)
                    model.load_adapter(
                        os.path.join(training_args.output_dir, "best_model",
                                     language) if training_args.do_train else
                        adapter_args.load_lang_adapter,
                        config=lang_adapter_config,
                        load_as=language,
                        leave_out=leave_out,
                    )
                task_adapter_config = AdapterConfig.load(config="pfeiffer",
                                                         non_linearity="gelu",
                                                         reduction_factor=16,
                                                         leave_out=leave_out)
                model.load_adapter(
                    os.path.join(training_args.output_dir, "best_model",
                                 task_name)
                    if training_args.do_train else adapter_args.load_adapter,
                    config=task_adapter_config,
                    load_as=task_name,
                    leave_out=leave_out,
                )
                if language:
                    model.set_active_adapters(
                        ac.Stack(lang_adapter_name, task_name))
                else:
                    model.set_active_adapters(task_name)
                model.to(training_args.device)
            else:
                trainer.model = AutoModelWithHeads.from_pretrained(
                    os.path.join(training_args.output_dir, "best_model"),
                    from_tf=bool(".ckpt" in model_args.model_name_or_path),
                    config=config,
                    cache_dir=model_args.cache_dir,
                ).to(training_args.device)

        predictions, _, metrics = trainer.predict(dataset["test"])

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

    return results
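Here preprocess_dataset receives the DatasetDict returned by load_dataset("universal_dependencies", ...) together with the tokenizer and the UD head-label list. Its implementation is not included in the snippet; a common shape for such a helper is a batched Dataset.map over the pre-split tokens, sketched below (a rough approximation: the real function also aligns head/relation labels to sub-word positions, uses data_args for the sequence length, and pads labels with pad_token_id):

def preprocess_dataset(dataset, tokenizer, labels, data_args, pad_token_id=-1):
    # Rough sketch only: tokenize the already-split words of each sentence.
    def tokenize(batch):
        return tokenizer(batch["tokens"],
                         is_split_into_words=True,
                         truncation=True,
                         padding="max_length",
                         max_length=128)  # illustrative length; the real code likely reads it from data_args
    return dataset.map(tokenize, batched=True)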
Example #21
	data_matrix = []
	data_labels = []
	attributes_type = {}
	attribute_indexes = {}
	
	isCategorical = {}
	attribute_definitions = []
	relation_name, class_name = datasetParser.get_attributes_info(dataset_name, attribute_definitions, attributes_type, isCategorical, attribute_indexes)
	
if (loocv):
	validation.loocv_folds(path_results+"it"+str(actual_iteration)+"/"+name+"_it"+str(actual_iteration)+".arff",path_results+"it"+str(actual_iteration))
else:
	validation.cvs_folds(path_results+"it"+str(actual_iteration)+"/"+name+"_it"+str(actual_iteration)+".arff",path_results+"it"+str(actual_iteration), categorical_attributes, binarised_attribute_mapping , n_folds, db_scv)
	
if pre_processing > 0:
	preprocessing.preprocess_dataset(pre_processing, path_results, actual_iteration, dataset_name, n_folds, binarised_attribute_mapping, isCategorical, relation_name)

#print information about the configuration
print(("Random Seed: {0}".format(current_seed)))
print("Configuration:")
print(("Dataset: {0}".format(name)))
print(("Num Atts: {0}".format(n_atts)))
print(("Num Samples: {0}".format(n_samples)))
print(("Tolerance value: {0}".format(tolerance)))
print(("Missing values: {0}".format(missing_values)))
print(("Categorical attributes: {0}".format(categorical_attributes)))
print(("Classification cost: {0}".format(cs_rf)))
if cs_rf == "yes":
	for i,value in enumerate(cost):
		print(("Cost of class {0} : {1}".format(i,value)))
print(("Block type: {0}".format(block_type)))
Example #22
        'Buildings-Grass-Trees-Drives',
    ]
    label_dictionary = {
        0: 'Unclassified',
        1: 'Corn-notill',
        2: 'Corn-mintill',
        3: 'Grass-pasture',
        4: 'Grass-trees',
        5: 'Soybean-notill',
        6: 'Soybean-mintill',
        7: 'Soybean-clean',
        8: 'Woods',
        9: 'Buildings-Grass-Trees-Drives',
    }

    X, X_train, X_test, y_train, y_test = pp.preprocess_dataset(
        classes_authorized, components, compression_method, patch_size)

    # Training
    input_shape = X_train[0].shape
    print(input_shape)

    model, lr = choose_model(model_name, input_shape, num_classes)
    model.summary()

    model.fit(X_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1)

    if not pruning_enabled and not quantization_enabled:
Example #23
def main(datafile='./data/train_.pt',
         epochs=1000,
         learning_rate=1e-3,
         dim_out=10,
         device='cuda:0',
         project_name='em_showers_net_training',
         work_space='schattengenie',
         graph_embedder='GraphNN_KNN_v2',
         edge_classifier='EdgeClassifier_v1',
         patience=10):

    experiment = Experiment(project_name=project_name, workspace=work_space)

    early_stopping = EarlyStopping_(patience=patience, verbose=True)

    device = torch.device(device)
    showers = preprocess_dataset(datafile)
    showers_train, showers_test = train_test_split(showers, random_state=1337)

    train_loader = DataLoader(showers_train, batch_size=1, shuffle=True)
    test_loader = DataLoader(showers_test, batch_size=1, shuffle=True)

    k = showers[0].x.shape[1]
    print(k)
    graph_embedder = str_to_class(graph_embedder)(dim_out=dim_out,
                                                  k=k).to(device)
    edge_classifier = str_to_class(edge_classifier)(dim_out=dim_out).to(device)

    criterion = FocalLoss(gamma=2.)
    optimizer = torch.optim.Adam(list(graph_embedder.parameters()) +
                                 list(edge_classifier.parameters()),
                                 lr=learning_rate)

    loss_train = RunningAverageMeter()
    loss_test = RunningAverageMeter()
    roc_auc_test = RunningAverageMeter()
    pr_auc_test = RunningAverageMeter()
    acc_test = RunningAverageMeter()
    class_disbalance = RunningAverageMeter()

    for _ in tqdm(range(epochs)):
        for shower in train_loader:
            shower = shower.to(device)
            edge_labels_true, edge_labels_predicted = predict_one_shower(
                shower,
                graph_embedder=graph_embedder,
                edge_classifier=edge_classifier)
            # calculate the batch loss
            loss = criterion(edge_labels_predicted, edge_labels_true.float())
            # Zero gradients, perform a backward pass, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_train.update(loss.item())
            class_disbalance.update((edge_labels_true.sum().float() /
                                     len(edge_labels_true)).item())

        y_true_list = deque()
        y_pred_list = deque()
        for shower in test_loader:
            shower = shower.to(device)
            edge_labels_true, edge_labels_predicted = predict_one_shower(
                shower,
                graph_embedder=graph_embedder,
                edge_classifier=edge_classifier)

            # calculate the batch loss
            loss = criterion(edge_labels_predicted, edge_labels_true.float())
            y_true, y_pred = edge_labels_true.detach().cpu().numpy(
            ), edge_labels_predicted.detach().cpu().numpy()
            y_true_list.append(y_true)
            y_pred_list.append(y_pred)
            acc = accuracy_score(y_true, y_pred.round())
            roc_auc = roc_auc_score(y_true, y_pred)
            pr_auc = average_precision_score(y_true, y_pred)
            loss_test.update(loss.item())
            acc_test.update(acc)
            roc_auc_test.update(roc_auc)
            pr_auc_test.update(pr_auc)
            class_disbalance.update((edge_labels_true.sum().float() /
                                     len(edge_labels_true)).item())

        #f = plot_aucs(y_true=y_true, y_pred=y_pred)
        #experiment.log_figure("Optimization dynamic", f, overwrite=True)
        experiment_key = experiment.get_key()

        eval_loss = loss_test.val
        early_stopping(eval_loss, graph_embedder, edge_classifier,
                       experiment_key)

        ####
        if early_stopping.early_stop:
            print("Early stopping")
            break
        # TODO: save best
        #torch.save(graph_embedder.state_dict(), "graph_embedder_{}.pt".format(experiment_key))
        #torch.save(edge_classifier.state_dict(), "edge_classifier_{}.pt".format(experiment_key))

        experiment.log_metric('loss_test', loss_test.val)
        experiment.log_metric('acc_test', acc_test.val)
        experiment.log_metric('roc_auc_test', roc_auc_test.val)
        experiment.log_metric('pr_auc_test', pr_auc_test.val)
        experiment.log_metric('class_disbalance', class_disbalance.val)

        y_true = np.concatenate(y_true_list)
        y_pred = np.concatenate(y_pred_list)

    # load the last checkpoint with the best model
    graph_embedder.load_state_dict(
        torch.load("graph_embedder_{}.pt".format(experiment_key)))
    edge_classifier.load_state_dict(
        torch.load("edge_classifier_{}.pt".format(experiment_key)))
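RunningAverageMeter is not defined in this fragment; judging from its update()/.val usage it is presumably an exponential-moving-average tracker. A minimal implementation consistent with that usage (the momentum value is an assumption) is:

class RunningAverageMeter:
    # Assumed helper: exponential moving average of a scalar metric.
    def __init__(self, momentum=0.99):
        self.momentum = momentum
        self.val = None

    def update(self, value):
        if self.val is None:
            self.val = value
        else:
            self.val = self.momentum * self.val + (1.0 - self.momentum) * value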
Example #24
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'atmodel.h5'
    enc_arch = 'encoder.json'
    dec_arch = 'decoder.json'
    data_path = '../data/w16to19hukusimaconv.txt'
    num_words = 7000
    num_data = 4367

    # Data loading.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Pre-processing.
    #ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts,
                                                        ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)

    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    print(x_train[:3])
    print(y_train[:3])
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    print(en_vocab.word_index)
    print(ja_vocab.word_index)

    # Build a simple model.
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    # Build an attention model.
    #encoder = Encoder(num_words, return_sequences=True)
    #decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Train the model.
    callbacks = [
        EarlyStopping(patience=10),
        ModelCheckpoint(model_path,
                        save_best_only=True,
                        save_weights_only=True)
    ]
    """
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)"""
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    #api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    texts = ["お聞きしたいと思います", "さっき の 答弁 全く 納得 できません", "全く 納得 い き ません", "ありがとうございました", "おはようございます",\
            "よろしいでしょうか", "是非 よろしくお願いいたします", "もう少し 具体的に 教えて いただける と 助 か る んですけれども", "ちょっと 待 って", "質問 主 意 書 では 当然 混 同 は しておりません",\
            "正 式 な 要求 でいい んですか", "時間ですので まとめて ください", "ちょっと 静粛に お願いします", "よろしいですか", "静粛に お願いします",\
            "答弁 を まとめて ください", "時間 ですから", "驚 き の答弁 ですね", "それは いつ ごろ でしょうか", "そのとおり です"
    ]
    for text in texts:
        decoded = api.predict(text=text)
        print('入力: {}'.format(text))
        print('応答: {}'.format(decoded))

    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
Example #25
def main():
    # Set hyper-parameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    # model_path = 'models/rnn_model.h5'
    # model_path = 'models/lstm_model.h5'
    # model_path = 'models/CNN_model.h5'
    model_path = 'models/lstm_iniemb_model.h5'
    num_words = 4000
    num_label = 2

    # Data loading.
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Word embeddings.
    wv = load_fasttext('data/cc.ja.300.vec')
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Build the model.
    # model = RNNModel(num_words, num_label, embeddings=None).build()
    model = LSTMModel(num_words, num_label, embeddings=wv).build()
    # model = CNNModel(num_words, num_label, embeddings=None).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Prediction.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print('precision: {:.4f}'.format(
        precision_score(y_test, y_pred, average='binary')))
    print('recall: {:.4f}'.format(
        recall_score(y_test, y_pred, average='binary')))
    print('f1: {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
Example #26
def launch_training(training_params):
    """
    Load the data, and train a Keras model.

    :param training_params: a TrainingParams object which contains each parameter of the training
    :return:
    """
    if os.path.exists(training_params.path_out) is False:
        os.mkdir(os.path.abspath(training_params.path_out))

    ###### LOADING VALIDATION DATA #######
    validset, valid_targets = load_dataset_in_memory_and_resize(training_params.data_access, "valid",
                                                                training_params.division, training_params.dataset_path,
                                                                training_params.targets_path, training_params.final_size,
                                                                training_params.final_size, training_params.test_batch_size)
    valid_targets = convert_labels(valid_targets)

    ###### Preprocessing VALIDATION DATA #######
    for mode in training_params.valid_preprocessing:
        validset = preprocess_dataset(validset, training_params, mode)
    # Transpose validset >> (N, channel, X, Y)
    validset = validset.transpose(0,3,1,2)
    # Multiple input ?
    if training_params.multiple_inputs>1:
        validset = [validset for i in range(training_params.multiple_inputs)]

    ###### MODEL INITIALIZATION #######
    with timer("Model initialization"):
        model = training_params.initialize_model()
    if training_params.pretrained_model is not None:
        with timer("Pretrained Model initialization"):
            pretrained_model = training_params.initialize_pretrained_model()
            training_params.generator_args.append(pretrained_model)
            # preprocess the validset with the pretrained model
            if type(pretrained_model) is list:
                features = []
                for pmodel in pretrained_model:
                    features.append(pmodel.predict(validset))
                validset = np.concatenate(features, axis=1)
            else:
                validset = pretrained_model.predict(validset)

    ###### SAVE PARAMS ######
    s = training_params.print_params()
    # Save command
    f = open(training_params.path_out+"/command.txt", "w")
    f.writelines(" ".join(sys.argv))
    f.writelines(s)
    f.close()
    # Print architecture
    print_architecture(model, path_out=training_params.path_out + "/architecture.txt")

    ###### TRAINING LOOP #######
    count = training_params.fine_tuning

    with timer("Training"):
        while training_params.learning_rate >= training_params.learning_rate_min and count<training_params.nb_max_epoch:

            if count != 0: # Restart from the best model with a lower LR
                model = training_params.initialize_model()
                model.load_weights(training_params.path_out+"/MEM_%d/best_model.cnn"%(count-1))
            # Callbacks
            early_stopping = EarlyStopping(monitor="val_loss", patience=training_params.max_no_best)
            save_model = ModelCheckpoint_perso(filepath=training_params.path_out+"/MEM_%d"%count, verbose=1,
                                               optional_string=s, monitor="val_acc", mode="acc")

            history = model.fit_generator(training_params.generator(*training_params.generator_args),
                                          nb_epoch=training_params.nb_max_epoch,
                                          samples_per_epoch= int(training_params.Ntrain*training_params.bagging_size),
                                          show_accuracy=True,
                                          verbose=training_params.verbose,
                                          validation_data=(validset,  valid_targets),
                                          callbacks=[early_stopping, save_model])

            training_params.learning_rate *= 0.1
            training_params.update_model_args()
            save_history(training_params.path_out+"/MEM_%d/history.pkl"%count, history)
            count += 1
Example #27
def main(input_datafile='./data_new/data/200_test.pt',
         output_datafile='./data_new//data/200test_preprocessed.pt'):
    print("Let's start")
    showers = preprocess_dataset(input_datafile)
    torch.save(showers, output_datafile)
Example #28
def launch_adversarial_training(training_params):
    """
    Load the data, and train a Keras model.

    :param training_params: a TrainingParams object which contains each parameter of the training
    :return:
    """
    if os.path.exists(training_params.path_out) is False:
        os.mkdir(os.path.abspath(training_params.path_out))

    ###### LOADING VALIDATION DATA #######
    validset, valid_targets = load_dataset_in_memory_and_resize(training_params.data_access, "valid", training_params.dataset_path,
                                                                training_params.targets_path, training_params.final_size,
                                                                training_params.final_size, training_params.test_batch_size)
    valid_targets = convert_labels(valid_targets)

    ###### Preprocessing VALIDATION DATA #######
    for mode in training_params.valid_preprocessing:
        validset = preprocess_dataset(validset, training_params, mode)
    # Transpose validset >> (N, channel, X, Y)
    validset = validset.transpose(0,3,1,2)
    # Multiple input ?
    if training_params.multiple_inputs>1:
        validset = [validset for i in range(training_params.multiple_inputs)]

    ###### MODEL INITIALIZATION #######
    with timer("Model initialization"):
        model = training_params.initialize_model()
    if training_params.pretrained_model is not None:
        with timer("Pretrained Model initialization"):
            pretrained_model = training_params.initialize_pretrained_model()
            training_params.generator_args.append(pretrained_model)
            # preprocess the validset with the pretrained model(s)
            if type(pretrained_model) is list:
                features = []
                for pmodel in pretrained_model:
                    features.append(pmodel.predict(validset))
                validset = np.concatenate(features, axis=1)
            else:
                validset = pretrained_model.predict(validset)

    ###### SAVE PARAMS ######
    s = training_params.print_params()
    # Save command
    f = open(training_params.path_out+"/command.txt", "w")
    f.writelines(" ".join(sys.argv))
    f.writelines(s)
    f.close()
    # Print architecture
    print_architecture(model, path_out=training_params.path_out + "/architecture.txt")

    ###### TRAINING SET #######
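    # Unlike the in-memory validation set, training batches are drawn on the fly
    # from a FuelDataset (batch size and bagging taken from training_params).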

    train_dataset = FuelDataset("train", training_params.tmp_size,
                                batch_size=training_params.batch_size,
                                bagging=training_params.bagging_size,
                                bagging_iterator=training_params.bagging_iterator)

    ###### ADVERSARIAL MAPPING ######
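    # Compile a backend function returning the cross-entropy loss and its gradient
    # with respect to the input batch; the training loop below uses this gradient
    # to build adversarial examples.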

    input_ = model.layers[0].input
    y_ = model.y
    layer_output = model.layers[-1].get_output()
    xent = K.categorical_crossentropy(y_, layer_output)
    loss = xent.mean()
    grads = K.gradients(loss, input_)
    get_grads = K.function([input_, y_], [loss, grads])

    ###### TRAINING LOOP #######
    count = training_params.fine_tuning
    epoch_count = 0

    with timer("Training"):
        while training_params.learning_rate >= training_params.learning_rate_min and epoch_count<training_params.nb_max_epoch:

            if count != 0: # Restart from the best model with a lower LR
                model = training_params.initialize_model()
                model.load_weights(training_params.path_out+"/MEM_%d/best_model.cnn"%(count-1))
                # Recompile get_grads
                input_ = model.layers[0].input
                y_ = model.y
                layer_output = model.layers[-1].get_output()
                xent = K.categorical_crossentropy(y_, layer_output)
                loss = xent.mean()
                grads = K.gradients(loss, input_)
                get_grads = K.function([input_, y_], [loss, grads])

            best = 0.0
            patience = training_params.max_no_best
            losses = []
            adv_losses = []
            accuracies = []
            adv_accuracies = []
            valid_losses = []
            valid_accuracies = []
            epoch_count = 0
            no_best_count = 0
            path = training_params.path_out + "/MEM_%d"%count
            if os.path.exists(path) is False:
                os.mkdir(path)
            # Log file
            f = open(path+"/log.txt", "w")
            f.write("LR = %.2f\n"%training_params.learning_rate)
            f.close()
            # Config file
            open(path+"/config.netconf", 'w').write(model.to_json())

            while no_best_count < patience and epoch_count < training_params.nb_max_epoch:
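                # One epoch: train on every clean batch and on its adversarial
                # counterpart, then score the model on the validation set.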
                new = True
                loss = 0.0
                adv_loss = 0.0
                accuracy = 0.0
                adv_accuracy = 0.0
                # Trainset Loop
                N = training_params.Ntrain // training_params.batch_size
                for i in range(N):
                    # Train
                    print "\rEpoch %d : Batch %d over %d"%(epoch_count, i, N),
                    processed_batch, labels = get_next_batch(train_dataset, training_params.batch_size,
                                                             training_params.final_size,
                                                             training_params.preprocessing_func,
                                                             training_params.preprocessing_args)
                    l, acc = model.train_on_batch(processed_batch, labels, accuracy=True)
                    # Update stats
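                    # (running loss/accuracy kept as an exponential moving average: 0.9 old, 0.1 new)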
                    if new:
                        loss = l
                        accuracy = acc
                    else:
                        loss = 0.9*loss + 0.1*l
                        accuracy = 0.9*accuracy + 0.1*acc
                    # Get adversarial examples
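                    # (fast-gradient-sign style: perturb each input by the sign of
                    # the loss gradient, with a fixed step of 1 here)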
                    l, grads = get_grads([processed_batch, labels])
                    updates = np.sign(grads)
                    adversarials = processed_batch + updates
                    # Train on adv examples
                    adv_l, adv_acc = model.train_on_batch(adversarials, labels, accuracy=True)
                    # Update stats
                    if new:
                        adv_loss = adv_l
                        adv_accuracy = adv_acc
                        new = False
                    else:
                        adv_loss = 0.9*adv_loss + 0.1*adv_l
                        adv_accuracy = 0.9*adv_accuracy + 0.1*adv_acc
                # Store stats
                losses.append(loss)
                accuracies.append(accuracy)
                adv_losses.append(adv_loss)
                adv_accuracies.append(adv_accuracy)
                # Validset loss and accuracy
                out = model.predict(validset)
                valid_loss = categorical_crossentropy(valid_targets, out)
                n_correct = np.sum(np.argmax(valid_targets, axis=1) == np.argmax(out, axis=1))
                score = float(n_correct)/valid_targets.shape[0]
                valid_losses.append(valid_loss)
                valid_accuracies.append(score)

                # Stop criterion and Save model
                string = "***\nEpoch %d: Loss : %0.5f, Adv loss : %0.5f, Valid loss : %0.5f, " \
                         "Acc : %0.5f, Adv acc : %0.5f, Valid acc : %0.5f"%(epoch_count, losses[-1], adv_losses[-1],
                                                                            valid_losses[-1], accuracies[-1],
                                                                            adv_accuracies[-1], valid_accuracies[-1])
                if score > best:
                    no_best_count = 0
                    save_path = path+"/best_model.cnn"
                    if training_params.verbose>0:
                        string = string +"\tBEST\n"
                        print string
                        write_log(path+"/log.txt", string)
                    best = score
                    model.save_weights(save_path, overwrite=True)
                else:
                    no_best_count += 1
                    save_path = path+"/last_epoch.cnn"
                    if training_params.verbose>0:
                        string = string + "\n"
                        print string
                        write_log(path+"/log.txt", string)
                    model.save_weights(save_path, overwrite=True)
                epoch_count += 1

            # Update learning rate
            training_params.learning_rate *= 0.1
            training_params.update_model_args()
            with open(path + "/history.pkl","w") as f:
                pickle.dump(losses,f)
                pickle.dump(adv_losses,f)
                pickle.dump(valid_losses,f)
                pickle.dump(accuracies,f)
                pickle.dump(adv_accuracies,f)
                pickle.dump(valid_accuracies,f)
            count += 1