def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model_{}.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build models.
    models = [
        UnidirectionalModel(num_words, target_vocab.size).build(),
        BidirectionalModel(num_words, target_vocab.size).build(),
    ]

    for i, model in enumerate(models):
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(i), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(i))
        api = InferenceAPI(model, source_vocab, target_vocab)
        y_pred = api.predict_from_sequences(x_test)
        print(classification_report(y_test, y_pred, digits=4))
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # model_path = 'models/unidirectional_model.h5'
    model_path = 'models/bidirectional_model.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build the model.
    # model = UnidirectionalModel(num_words, target_vocab.size).build()
    model = BidirectionalModel(num_words, target_vocab.size).build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)

    # Prediction and evaluation.
    model = load_model(model_path)
    api = InferenceAPI(model, source_vocab, target_vocab)
    y_pred = api.predict_from_sequences(x_test)
    print(classification_report(y_test, y_pred, digits=4))
def sampleGenerator(midis, batch_size, fs=50, shuffle_piece=False, train=True,
                    midi_batch=5):
    """Generates batches of samples."""
    # Infinite loop
    while True:
        # Generate order of exploration of dataset
        # random.shuffle(midis, random.random)
        for i in range(int(len(midis) / midi_batch)):
            # Find list of IDs
            # midis_temp = midis[i*batch_size:(i+1)*batch_size]
            # Generate data for the next group of MIDI files
            X, y = create_dataset(midis[i * midi_batch:(i + 1) * midi_batch],
                                  fs=fs, poly=False)
            # X_train, X_test, y_train, y_test = train_test_split(X, y,
            #                                                     test_size=0.2,
            #                                                     random_state=7)
            X, y = randomize_set(X, y, seed=5)
            if train:
                # First 80% of the shuffled samples are used for training.
                X = X[:int(0.8 * len(X))]
                y = y[:int(0.8 * len(y))]
                X, y = randomize_set(X, y)
                imax = int(X.shape[0] / batch_size) - 1
                indexes = np.arange(imax)
                for j in indexes:
                    yield (X[j * batch_size:(j + 1) * batch_size, :, :],
                           np.squeeze(y[j * batch_size:(j + 1) * batch_size, :, :]))
            else:
                # Remaining 20% are used for validation.
                X = X[int(0.8 * len(X)):]
                y = y[int(0.8 * len(y)):]
                imax = int(X.shape[0] / batch_size) - 1
                indexes = np.arange(imax)
                if imax == 0:
                    yield X, np.squeeze(y)
                else:
                    for j in indexes:
                        yield (X[j * batch_size:(j + 1) * batch_size, :, :],
                               np.squeeze(y[j * batch_size:(j + 1) * batch_size, :, :]))
                # yield X, np.squeeze(y)
            if shuffle_piece:
                random.shuffle(indexes)
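# A hedged usage sketch (not part of the original script): an infinite generator like
# the one above is typically consumed by Keras with an explicit number of steps per
# epoch. The names `midis` and `model`, and the step counts, are assumed placeholders.
#
#   train_gen = sampleGenerator(midis, batch_size=64, fs=50, train=True)
#   val_gen = sampleGenerator(midis, batch_size=64, fs=50, train=False)
#   model.fit(train_gen, steps_per_epoch=500, epochs=10,
#             validation_data=val_gen, validation_steps=50)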
def main():
    # Set hyper-parameters.
    emb_dim = 50
    epochs = 2
    model_path = 'model.h5'
    negative_samples = 1
    num_words = 10000
    window_size = 1

    # Load the corpus.
    text = load_data(filepath='../chap04/data/ja.text8')

    # Build the vocabulary.
    vocab = build_vocablary(text, num_words)

    # Create the dataset.
    x, y = create_dataset(text, vocab, num_words, window_size, negative_samples)

    # Build the model.
    model = EmbeddingModel(num_words, emb_dim)
    model = model.build()
    model.compile(optimizer='adam', loss='binary_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=1),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x, y=y,
              batch_size=128,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks)

    # Prediction.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab)
    pprint(api.most_similar(word='日本'))
def train_from_batch(path_to_dir, num_in_batch, model, fs=50, midi_num=None,
                     data_type='roll', poly=False):
    if midi_num:
        midi_files_num = len(os.listdir(path_to_dir)[:midi_num])
    else:
        midi_files_num = len(os.listdir(path_to_dir))
    file_list = os.listdir(path_to_dir)
    random.shuffle(file_list)
    num_files_in_batch = num_in_batch
    histories = []
    for i in range(0, int(midi_files_num / num_files_in_batch) - 1):
        if data_type == 'roll':
            midis, first, last = parse_directory(
                path_to_dir,
                file_list[i * num_files_in_batch:
                          i * num_files_in_batch + num_files_in_batch])
            X, y = create_dataset(midis, fs=fs, poly=poly)
        else:
            events, encoded, X, y = parse_directory_for_events(
                path_to_dir, fs,
                file_list[i * num_files_in_batch:
                          i * num_files_in_batch + num_files_in_batch])
        # Elapsed time; `start` is expected to be a timer set before this function is called.
        print(time.time() - start)
        history = model.fit(X, np.squeeze(y), epochs=5, batch_size=128,
                            validation_split=0.2)
        histories.append(history.history)
        X = None
        y = None
    return model, history, combine_history(histories)
    # for i in range(5):
    #     model, history = train_from_batch(path_to_directory, num_in_batch, model)
    #     histories.append(history)
    end = time.time() - start
    print(end)
    # history = combine_history(histories)
    model.save(model_path + ".h5")
    plot_model(model, to_file=model_path + '.png')
    vis(history, model_path)
else:
    path_to_directory = r"C:\Users\Maciek\Downloads\inputs"
    # path_to_directory = r"C:\Users\user\Desktop\Sound_generator\test"
    file_list = os.listdir(path_to_directory)[:1]
    midis, first, last = parse_directory(path_to_directory, file_list)
    X1, y1 = create_dataset(midis, fs)
    X, y = create_dataset_channels(midis, fs)
    folded = fold(X)
    midi_obj_from_roll = None
    for i, channel in enumerate(folded):
        monophonic_unsq = expand_roll(channel[1, :, :].T, delete_repress=repress)
        midi_obj_from_roll = piano_roll_to_midi_mono(
            monophonic_unsq.T, fs, midi=midi_obj_from_roll)
    midi_obj_from_roll.write(
        r'C:\Users\Maciek\Downloads\master-master\test_dur5.mid')
    # print("Dataset generated")
    # print(start - time.time())
    # start_learning = time.time()
    #
    # keras = False
    print('Sequences generated')
    return seeds, temperatures_high, temperatures_low, temperatures_mid


if __name__ == '__main__':
    model_path = r"C:\Users\user\Desktop\Sound_generator\models\test.h5"
    # model_path = r"C:\Users\Maciek\Downloads\master-master\master-master\lstm_repress_filtered.h5"
    seed_file = r"C:\Users\user\Desktop\Sound_generator\piano_midi\bach_846.mid"
    # seed_file = r"C:\Users\Maciek\Downloads\inputs\bach_846.mid"
    # out_path = r"C:\Users\Maciek\Downloads\master-master\master-master\{}.mid"
    out_path = r"C:\Users\user\Desktop\Sound_generator\midis\{}.mid"
    fs = 50
    seq_len = 1000
    midi_file = pretty_midi.PrettyMIDI(seed_file)
    midi_obj = MidiParser(midi_file)
    X, y = create_dataset(midi_obj, fs=fs)
    poly = False
    model = load_model(model_path)
    seeds, high, low, mid = generate(X, model, MAX_LEN, seq_len, poly, iters=5)
    all_notes = []
    # for i, melody in enumerate(mid):
    #     try:
    #         new_notes = note_events_to_midi(np.squeeze(np.array(melody).T),
    #                                         'gen_mid_events{}.mid'.format(i), fs=50)
    #         all_notes.append(new_notes)
    #     except:
    #         print('Wrong midi created for {}'.format(i))
    for i, melody in enumerate(high):
        x = np.array(melody)
        x = expand_roll(np.squeeze(x).T, delete_repress=True).T
        seed = expand_roll(np.squeeze(seeds[i]).T, delete_repress=True).T
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'atmodel.h5'
    enc_arch = 'encoder.json'
    dec_arch = 'decoder.json'
    data_path = '../data/w16to19hukusimaconv.txt'
    num_words = 7000
    num_data = 4367

    # Data loading.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Preprocessing.
    # ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    print(x_train[:3])
    print(y_train[:3])
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)
    print(en_vocab.word_index)
    print(ja_vocab.word_index)

    # Build a simple model.
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    # Build an attention model.
    # encoder = Encoder(num_words, return_sequences=True)
    # decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Train the model.
    callbacks = [
        EarlyStopping(patience=10),
        ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
    ]
    """
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)
    """
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    # api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    texts = ["お聞きしたいと思います", "さっき の 答弁 全く 納得 できません",
             "全く 納得 い き ません", "ありがとうございました", "おはようございます",
             "よろしいでしょうか", "是非 よろしくお願いいたします",
             "もう少し 具体的に 教えて いただける と 助 か る んですけれども",
             "ちょっと 待 って", "質問 主 意 書 では 当然 混 同 は しておりません",
             "正 式 な 要求 でいい んですか", "時間ですので まとめて ください",
             "ちょっと 静粛に お願いします", "よろしいですか", "静粛に お願いします",
             "答弁 を まとめて ください", "時間 ですから", "驚 き の答弁 ですね",
             "それは いつ ごろ でしょうか", "そのとおり です"]
    for text in texts:
        decoded = api.predict(text=text)
        print('Input   : {}'.format(text))
        print('Response: {}'.format(decoded))

    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
from preprocessing import create_dataset
from models import *
import torch
import utils

# Create the dataset.
train_packs, train_picks, test_packs, test_picks = create_dataset()

# Initialize the model with 249 cards and 15 archetypes.
rank_model = RankingNet(249, 15)
optimizer = torch.optim.Adam(rank_model.parameters(), lr=0.1)

# Cross-entropy loss works well for this problem because we are optimizing
# for a single pick out of a set of options, which can be described as a
# categorical choice.
loss_function = torch.nn.CrossEntropyLoss()

# Only consider picks where the player has likely solidified their
# archetype (e.g., early in pack 2).
train_x = torch.flatten(train_packs[:, 16:, :], start_dim=0, end_dim=1)
train_y = torch.flatten(train_picks[:, 16:, :], start_dim=0, end_dim=1)

# Train the model.
utils.train(rank_model, loss_function, optimizer, train_x, train_y, epochs=5)
torch.save(rank_model, 'Saved_Models/rank_model.pkl')

# Initialize the drafting model with the learned weights from the rank model.
init_weights = rank_model.ranking_matrix.detach()

# Normalize the weights such that 1 is the largest initial weight.
smaller_init_weights = init_weights / init_weights.max(0, keepdim=True)[0]
draft_model = DraftNet(smaller_init_weights)
optimizer = torch.optim.Adam(draft_model.parameters(), lr=0.1)

# Flatten the drafts so that the algorithm only considers each pick
# individually, and remove the archetype label to avoid leakage.
train_x = torch.flatten(train_packs, start_dim=0, end_dim=1)[:, 1:]
train_y = torch.flatten(train_picks, start_dim=0, end_dim=1)
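# A minimal sketch of the kind of training loop utils.train is assumed to provide
# (the project's real implementation may differ): mini-batch gradient descent with
# CrossEntropyLoss, where each target row marks the card that was actually picked.
def _train_sketch(model, loss_function, optimizer, x, y, epochs=5, batch_size=512):
    for _ in range(epochs):
        for start in range(0, x.shape[0], batch_size):
            xb = x[start:start + batch_size]
            yb = y[start:start + batch_size]
            optimizer.zero_grad()
            logits = model(xb)                   # scores for every card in the pool
            # If picks are stored one-hot, convert them to class indices.
            loss = loss_function(logits, yb.argmax(dim=1))
            loss.backward()
            optimizer.step()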
from preprocessing import create_dataset
from models import *
import torch
import matplotlib.pyplot as plt
import utils
import sys

# Flags for train-test split and saving the models.
# Potential update: have these be command line params.
full_flag = False
save = True
# name = '_simple'

# Create the dataset.
train_packs, train_picks, test_packs, test_picks = create_dataset(
    full_dataset=full_flag, save_clusters=save)

# Initialize the model with 249 cards and 15 archetypes.
rank_model = RankingNet(249, 15)
optimizer = torch.optim.Adam(rank_model.parameters(), lr=0.1)

# Cross-entropy loss works well for this problem because we are optimizing
# for a single pick out of a set of options, which can be described as a
# categorical choice.
loss_function = torch.nn.CrossEntropyLoss()

# Only consider picks where the player has likely solidified their
# archetype (e.g., early in pack 2).
train_x = torch.flatten(train_packs[:, 16:, :], start_dim=0, end_dim=1)
train_y = torch.flatten(train_picks[:, 16:, :], start_dim=0, end_dim=1)

# Train the model.
train_loss = utils.train(rank_model, loss_function, optimizer, train_x,
# Narrow down dataset to only the selected country.
country = ' '.join(parsed.country)
assert country in all_countries, f'\'{country}\' is not a valid choice'
dataset = full_dataset.loc[country].price.values

# Scale data to the range 0-1.
dataset = np.reshape(dataset, (-1, 1))
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)

# Create X and Y sets.
train, test = preprocessing.split_dataset(dataset, ratio=0.8)
seq_len = parsed.seq_len
X_train, y_train = preprocessing.create_dataset(train, seq_len)
X_test, y_test = preprocessing.create_dataset(test, seq_len)
X_train = X_train.reshape((-1, seq_len, 1))
X_test = X_test.reshape((-1, seq_len, 1))

# Build the training model.
neurons = parsed.hidden_neurons
batch_sz = parsed.batch_size
training_model = Sequential()
training_model.add(
    LSTM(neurons, input_shape=(seq_len, 1), return_sequences=True))
training_model.add(LSTM(neurons, return_sequences=True))
training_model.add(Dense(1, activation='linear'))
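# A hedged sketch of the sliding-window helper assumed above; the repository's actual
# preprocessing.create_dataset may differ. Each sample is `seq_len` consecutive values
# of the series and the target is the value that follows them.
import numpy as np

def _create_dataset_sketch(series, seq_len):
    xs, ys = [], []
    for i in range(len(series) - seq_len):
        xs.append(series[i:i + seq_len])
        ys.append(series[i + seq_len])
    return np.array(xs), np.array(ys)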
# Set hyper-parameters.
emb_dim = 50
epochs = 10
model_path = 'model.h5'
negative_samples = 1
num_words = 10000
window_size = 1

# Load the corpus.
text = load_data(filepath='data/ja.text8')

# Build the vocabulary.
vocab = build_vocabulary(text, num_words)

# Create the dataset.
x, y = create_dataset(text, vocab, num_words, window_size, negative_samples)

# Build the model.
model = EmbeddingModel(num_words, emb_dim)
model = model.build()
model.compile(optimizer='adam', loss='binary_crossentropy')

# Prepare callbacks.
callbacks = [
    EarlyStopping(patience=1),
    ModelCheckpoint(model_path, save_best_only=True)
]

# Train the model.
model.fit(x=x, y=y,
def main(argv=None):
    trainX, testX, trainY, testY = preprocessing.create_dataset(0.1)
    eval(testX, testY)
    exists = os.path.isfile(os.path.join(config.resources_dir, config.final_txt))
    # Load the final converted text containing the list of lists for the
    # training if it already exists.
    if exists:
        fileinfo = os.stat(os.path.join(config.resources_dir, config.final_txt))
        if fileinfo.st_size > 9500000:
            # print("jp2")
            print("final text file already exists loading it from %s..."
                  % os.path.join(config.resources_dir, config.final_txt).split("/")[-1])
            with open(os.path.join(config.resources_dir, config.final_txt), 'r') as txtfile:
                training_list = json.load(txtfile)
    # Otherwise create it.
    else:
        training_list = create_dataset(resources_dir=args.resources_dir,
                                       annotation_dict=args.annotation_dict,
                                       senseXml1=args.senseXml1,
                                       bn2wn_mapping_txt=args.bn2wn_mapping_txt)
except:
    training_list = create_dataset(resources_dir=args.resources_dir,
                                   annotation_dict=args.annotation_dict,
                                   senseXml1=args.senseXml1,
                                   bn2wn_mapping_txt=args.bn2wn_mapping_txt)

# Create a dict containing the grid search parameters.
grid_params = {'min_count': config.min_count,
               'window': config.window,
               'size': config.size,
               'sample': config.sample,
               'alpha': config.alpha,
               'min_alpha': config.min_alpha,
               'negative': config.negative,
event, values = window.Read()
selection = values['select']
window.Close()

df = pd.read_csv('/data/exchange.csv', parse_dates=['date'],
                 index_col='country').loc[selection].dropna()
dataset = df.price.values
dates = df.date.values

dataset, scaler = preprocessing.normalize_dataframe(dataset)
train, test = preprocessing.split_dataset(dataset, ratio=0.6)

look_back = 30
X_train, Y_train = preprocessing.create_dataset(train, look_back)
X_test, Y_test = preprocessing.create_dataset(test, look_back)

# Reshape input to be [samples, time steps, features].
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
train_shape = X_train.shape

# assert False
X_train = tf.data.Dataset.from_tensor_slices((X_train, Y_train))
X_train = X_train.shuffle(10000).batch(64, drop_remainder=True)

model = networks.build_model(train_shape, neurons=64,
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/attention_model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    data_path = 'data/jpn.txt'
    num_words = 10000
    num_data = 20000

    # Data loading.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Pre-processing.
    ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Build the model.
    encoder = Encoder(num_words, return_sequences=True)
    decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    for text in texts:
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))

    # Evaluation.
    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
def run_models(words, models, verbose, train=True, test=True, embeddings=False):
    '''
    Runs all the models that are specified with the specified word set.
    It runs all preprocessing steps necessary for the models specified.

    Note: If a model is specified twice, it will be run twice, but the
    preprocessing on the input data will not be (useful to test for model
    parameter initialization).

    Returns a list containing the objects of the models used, the outputs they
    predicted and the sklearn classification reports (dictionary format), in
    the order in which they were provided.

    Keyword arguments:
    words: list of list of words and features. Format: n*m. n = nr of words,
           m = nr of features + expected output (single)
    models: a string containing the model names. Order is not important.
            Possible models are: NB, LR, SVM, HMM, CRF. Coming soon: CNN.
            If a model is specified twice, it will be run twice. The input is
            randomized only once, where applicable.
    verbose: 0: print nothing
             1: print results
             2: print status messages
             3: print both
    '''
    # Preparing data for one-hot encoding -- converts strings into integers.
    if any(i in models for i in ['NB', 'LR', 'SVM']):
        verbose & 2 and print('Initial pre-processing...')
        if embeddings:
            stems = [word[0] for word in words]
            words = [word[1:] for word in words]
        X, Y, transl, labels_num, labels_name = create_dataset(words)

    # Algorithm uses sentences (list of list of tuples): HMM
    if 'HMM' in models:
        verbose & 2 and print('Preprocessing data for HMM...')
        sentences_hmm, symbols, tag_set = words2tuples(words)
        _, y_train, _, y_test = split_tr([], sentences_hmm, 0.8)
        x_test = [[tup[0] for tup in sentence] for sentence in y_test]
        y_test = [[tup[1] for tup in sentence] for sentence in y_test]
        # shuffle_parallel(x_test, y_test)
        data_hmm = data_wrap(None, y_train, x_test, y_test)

    # Algorithms using shuffled, one-hot data: NB, LR, SVM
    if any(i in models for i in ['NB', 'LR', 'SVM']):
        verbose & 2 and print('Preprocessing data for NB, LR and/or SVM...')
        indexes = shuffle_parallel(X, Y)
        X_onehot_sh = one_hot(X, transl)
        if embeddings:
            verbose & 2 and print('Loading and generating embeddings...')
            X_onehot_sh = embeddings.insert_embeddings(X_onehot_sh, stems, indexes)
        x_train_oh_sh, y_train_oh_sh, x_test_oh_sh, y_test_oh_sh = split_tr(
            X_onehot_sh, Y, 0.8)
        data_shuffled = data_wrap(x_train_oh_sh, y_train_oh_sh, x_test_oh_sh,
                                  y_test_oh_sh, transl, labels_num, labels_name)

    # Ordered, using sentences (list of list of dict): CRF
    if 'CRF' in models:
        verbose & 2 and print('Preprocessing data for CRF...')
        tokens_dict, labels_dict = words2dictionary(words)
        shuffle_parallel(tokens_dict, labels_dict)
        tokens_train, labels_train, tokens_test, labels_test = split_tr(
            tokens_dict, labels_dict, 0.8)
        data_dictionary = data_wrap(tokens_train, labels_train, tokens_test,
                                    labels_test)

    model_objects = []
    model_results = []
    model_predictions = []

    # Removes clutter when calling the functions separately.
    # A list of function handlers could also be used, but I find that to be
    # less intuitive.
    def _add_to_output(model_y_pred):
        model_objects.append(model_y_pred[0])
        model_results.append(model_y_pred[1])
        if len(model_y_pred) > 2:
            model_predictions.append(model_y_pred[2])

    # Run each of the models from the parameters, while KEEPING THE ORDER they
    # were called in, and append it to the return lists.
    for model in models:
        if 'HMM' in model:
            verbose & 2 and print('Running HMM from nltk...')
            _add_to_output(HMM(data_hmm, symbols, tag_set, verbose & 1))
        if 'NB' in model:
            verbose & 2 and print('Running NB ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            if embeddings:
                _add_to_output(NB_cont(data_shuffled, verbose & 1))
            else:
                _add_to_output(NB_disc(data_shuffled, verbose & 1))
        if 'LR' in model:
            verbose & 2 and print('Running LR ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            _add_to_output(
                LR(data_shuffled, verbose & 1, C=(0.1 if embeddings else 5)))
        if 'SVM' in model:
            verbose & 2 and print('Running SVM ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            _add_to_output(SVM(data_shuffled, verbose & 1))
        if 'CRF' in model:
            verbose & 2 and print('Running CRF...')
            _add_to_output(CRF(data_dictionary, verbose & 1))

    return model_objects, model_results, model_predictions
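# Hedged usage sketch (assumed call, not part of the original module): `words` is the
# n*m list of words and features described in the docstring, and model names are
# matched by substring against the entries of `models`.
#
#   objects, reports, predictions = run_models(words, models=['NB', 'LR', 'CRF'], verbose=3)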