# Variant 1: train an RNN document classifier with early stopping on the
# validation set. The imports below are reconstructed; fh (file handling),
# defines, common, evaluation, reusable_holdout, and the RNN class plus the
# shuffle / expand_x_with_context_win / select_minibatch / predict helpers
# are project-local, and their exact import paths depend on the repo layout.
import copy
import random
import re
import sys
import timeit

import numpy as np
import pandas as pd
from scipy import stats
from hyperopt import STATUS_OK


def main(params=None):
    if params is None:
        params = {
            'dataset': 'DRLD',
            'exp_name': 'char_test',
            'test_fold': 0,
            'n_dev_folds': 1,
            'min_doc_thresh': 1,
            'initialize_word_vectors': True,
            'vectors': 'chars_word2vec_25',  # default_word2vec_300, anes_word2vec_300, chars_word2vec_25, eye_1 ...
            'init_scale': 0.2,
            'add_OOV_dim': True,
            'win': 1,                    # size of context window
            'add_DRLD': True,
            'rnn_type': 'basic',         # basic, GRU, or LSTM
            'n_hidden': 50,              # size of hidden units
            'pooling_method': 'max',     # max, mean, or attention1/2
            'bidirectional': True,
            'bi_combine': 'concat',      # concat, max, or mean
            'train_embeddings': True,
            'lr': 0.1,                   # learning rate
            'lr_emb_fac': 1,             # factor to modify learning rate for embeddings
            'decay_delay': 10,           # number of epochs with no improvement before decreasing learning rate
            'decay_factor': 0.5,         # factor by which to multiply learning rate in case of delay
            'n_epochs': 300,
            'add_OOV_noise': True,
            'OOV_noise_prob': 0.01,
            'minibatch_size': 16,
            'classify_minibatch_size': 64,
            'ensemble': False,
            'save_model': True,
            'seed': 42,
            'verbose': 1,
            'reuse': False,
            'orig_T': 0.04,
            'tau': 0.01,
            'clip_gradients': False
        }

    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ':', params[key]

    # seed the random number generators
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    vector_type = params['vectors'].split('_')[0]
    params['word2vec_dim'] = int(params['vectors'].split('_')[-1])

    reuser = None
    if params['reuse']:
        reuser = reusable_holdout.ReuseableHoldout(T=params['orig_T'], tau=params['tau'])

    if params['dataset'] == 'DRLD':
        datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']
    elif params['dataset'] == 'MIP':
        datasets = ['MIP-Personal-1', 'MIP-Personal-2', 'MIP-Political-1', 'MIP-Political-2']
    elif params['dataset'] == 'MOLD':
        datasets = ['McCain-Likes', 'McCain-Dislikes', 'Obama-Likes', 'Obama-Dislikes']
    elif params['dataset'] == 'Primary':
        datasets = ['Obama-Primary', 'Clinton-Primary']
    elif params['dataset'] == 'General':
        datasets = ['Obama-General', 'McCain-General']
    else:
        datasets = [params['dataset']]

    np.random.seed(params['seed'])
    random.seed(params['seed'])

    best_valid_f1s = []
    best_true_valid_f1s = []
    best_test_f1s = []
    best_train_f1s = []
    test_prediction_arrays = []

    output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'])
    output_filename = fh.make_filename(output_dir, 'params', 'txt')
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params['n_dev_folds']):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'], 'fold' + str(dev_fold))

        if vector_type == 'chars':
            all_data, words2idx, items, all_labels = common.load_char_data(datasets, params['test_fold'], dev_fold)
        else:
            all_data, words2idx, items, all_labels = common.load_data(datasets, params['test_fold'], dev_fold,
                                                                      params['min_doc_thresh'])
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy

        print "padding input with zeros"
        all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex)
        train_lex, valid_lex, test_lex = all_data
        train_masks, valid_masks, test_masks = all_masks

        print "expanding x with context windows"
        # convert x to context windows in advance
        train_x_win = expand_x_with_context_win(train_lex, params['win'])
        valid_x_win = expand_x_with_context_win(valid_lex, params['win'])
        test_x_win = expand_x_with_context_win(test_lex, params['win'])
        order = range(len(train_lex))
        print "done"

        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, 'n_train', n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        if params['initialize_word_vectors']:
            initial_embeddings = common.load_embeddings(params, words2idx)
            emb_dim = initial_embeddings.shape[1]
        else:
            initial_embeddings = None
            emb_dim = params['word2vec_dim']
        print "embedding dim =", emb_dim

        temp_output = fh.make_filename(output_dir, 'embedding_labels', 'json')
        fh.write_to_json(idx2words, temp_output)

        extra_input_dims = 0
        if params['add_DRLD']:
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(nh=params['n_hidden'],
                  nc=n_codes,
                  ne=vocsize,
                  de=emb_dim,
                  cs=params['win'],
                  extra_input_dims=extra_input_dims,
                  initial_embeddings=initial_embeddings,
                  init_scale=params['init_scale'],
                  rnn_type=params['rnn_type'],
                  train_embeddings=params['train_embeddings'],
                  pooling_method=params['pooling_method'],
                  bidirectional=params['bidirectional'],
                  bi_combine=params['bi_combine'],
                  clip_gradients=params['clip_gradients'])

        temp_filename = fh.make_filename(output_dir, 'initial_embeddings', 'npy')
        rnn.save_embeddings(temp_filename)

        # add extra input dimensions to differentiate between paired datasets
        train_likes = [1 if re.search('Likes', i) else 0 for i in train_items]
        dev_likes = [1 if re.search('Likes', i) else 0 for i in dev_items]
        test_likes = [1 if re.search('Likes', i) else 0 for i in test_items]

        train_dem = [1 if re.search('Democrat', i) else 0 for i in train_items]
        dev_dem = [1 if re.search('Democrat', i) else 0 for i in dev_items]
        test_dem = [1 if re.search('Democrat', i) else 0 for i in test_items]

        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        # train with early stopping on the validation set
        best_f1 = -np.inf
        params['clr'] = params['lr']
        for e in xrange(params['n_epochs']):
            # shuffle the input data
            shuffle([order, train_lex, train_y, train_extra, train_masks], params['seed'])
            params['ce'] = e  # store the current epoch
            tic = timeit.default_timer()

            ms = params['minibatch_size']
            n_train = len(train_lex)
            nll = 0
            for iteration, i in enumerate(range(0, n_train, ms)):
                minibatch_x, minibatch_mask, minibatch_extra, minibatch_y = \
                    select_minibatch(train_x_win, train_masks, train_extra, train_y,
                                     params['win'], i, ms, order,
                                     params['add_OOV_noise'], params['OOV_noise_prob'])
                nll_i, a_sum = rnn.train(minibatch_x, minibatch_mask, minibatch_y,
                                         params['win'], params['clr'], params['lr_emb_fac'],
                                         extra_input_dims, minibatch_extra)
                nll += nll_i
                print '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / float(n_sentences)),
                print 'completed in %.2f (sec), nll = %.2f, a_sum = %.1f <<\r' % (
                    timeit.default_timer() - tic, nll, np.max(a_sum)),
                sys.stdout.flush()

            if np.isnan(nll) or np.isinf(nll):
                if best_f1 > 0:
                    break
                else:
                    return {'loss': 1.0,
                            'final_test_f1': 0,
                            'valid_f1s': 0,
                            'true_valid_f1s': 0,
                            'train_f1s': 0,
                            'test_f1s': 0,
                            'status': STATUS_OK}

            # evaluation // back into the real world : idx -> words
            print ""

            predictions_train = predict(n_train, params['classify_minibatch_size'], train_x_win, train_masks,
                                        train_y, params['win'], extra_input_dims, train_extra, rnn, order)
            n_valid = len(valid_lex)
            n_test = len(test_lex)
            predictions_valid = predict(n_valid, params['classify_minibatch_size'], valid_x_win, valid_masks,
                                        valid_y, params['win'], extra_input_dims, dev_extra, rnn)
            predictions_test = predict(n_test, params['classify_minibatch_size'], test_x_win, test_masks,
                                       test_y, params['win'], extra_input_dims, test_extra, rnn)

            train_f1 = common.calc_mean_f1(predictions_train, train_y)
            test_f1 = common.calc_mean_f1(predictions_test, test_y)
            valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)
            question_f1s = []
            question_pps = []
            print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1

            if valid_f1 > best_f1:
                best_rnn = copy.deepcopy(rnn)
                best_f1 = valid_f1
                best_test_predictions = predictions_test
                if params['verbose']:
                    print('NEW BEST: epoch', e, 'valid f1', valid_f1, 'best test f1', test_f1)
                params['tr_f1'] = train_f1
                params['te_f1'] = test_f1
                params['v_f1'] = valid_f1
                params['be'] = e  # store the current epoch as a new best

            # learning rate decay if no improvement in a given number of epochs
            if abs(params['be'] - params['ce']) >= params['decay_delay']:
                params['clr'] *= params['decay_factor']
                params['be'] = params['ce']
                print "Reverting to current best; new learning rate = ", params['clr']
                # also reset to the previous best model
                rnn = best_rnn

            if params['clr'] < 1e-5:
                break
            if best_f1 == 1.0:
                break
            if best_f1 == 0 and e > 7:
                break

        if params['save_model']:
            predictions_test = predict(len(test_y), params['classify_minibatch_size'], test_x_win, test_masks,
                                       test_y, params['win'], extra_input_dims, test_extra, best_rnn)
            best_rnn.save(output_dir)
            common.write_predictions(datasets, params['test_fold'], dev_fold, predictions_test,
                                     test_items, output_dir)

        print('BEST RESULT: epoch', params['be'], 'train F1 ', params['tr_f1'], 'valid F1', params['v_f1'],
              'best test F1', params['te_f1'], 'with the model', output_dir)

        best_true_valid_f1s.append(params['v_f1'])
        best_test_f1s.append(params['te_f1'])
        best_train_f1s.append(params['tr_f1'])
        if reuser is not None:
            best_valid_f1 = reuser.mask_value(params['v_f1'], params['tr_f1'])
        else:
            best_valid_f1 = params['v_f1']
        best_valid_f1s.append(best_valid_f1)
        test_prediction_arrays.append(np.array(best_test_predictions, dtype=int))

    params['ensemble'] = False  # ensembling across folds is disabled here
    if params['ensemble']:
        test_predictions_stack = np.dstack(test_prediction_arrays)
        final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0]
        predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes)
        true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes)
        final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df)
    else:
        final_test_f1 = np.median(best_test_f1s)

    return {'loss': -np.median(best_valid_f1s),
            'final_test_f1': final_test_f1,
            'valid_f1s': best_valid_f1s,
            'train_f1s': best_train_f1s,
            'true_valid_f1s': best_true_valid_f1s,
            'test_f1s': best_test_f1s,
            'status': STATUS_OK}
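# Usage sketch (an assumption, not in the source): the dict returned by
# main() above -- a 'loss' plus 'status': STATUS_OK -- matches the objective
# contract of hyperopt's fmin, so the experiment can be tuned along these
# lines. The search space, and which keys are searched, are illustrative.
def tune_with_hyperopt(base_params, max_evals=50):
    from hyperopt import fmin, tpe, hp, Trials

    # sample two of the hyperparameters; everything else is taken from a
    # complete parameter dict like the defaults in main() above
    space = {
        'n_hidden': hp.quniform('n_hidden', 25, 100, 25),
        'lr': hp.loguniform('lr', np.log(0.001), np.log(0.5)),
    }

    def objective(sampled):
        params = dict(base_params)
        params['n_hidden'] = int(sampled['n_hidden'])  # quniform returns a float
        params['lr'] = sampled['lr']
        return main(params)

    trials = Trials()
    best = fmin(fn=objective, space=space, algo=tpe.suggest,
                max_evals=max_evals, trials=trials)
    return best, trials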
# Variant 2: load a previously trained LSTM and step through each document,
# writing per-timestep responses (sigmoid outputs plus gate activations) to
# disk for inspection. Same reconstructed imports as the first variant above,
# plus the project-local output_npy_files helper.


def main(params=None):
    if params is None:
        params = {
            'dataset': 'DRLD',
            'exp_name': 'best_minibatch_mod',
            'test_fold': 0,
            'n_dev_folds': 1,
            'min_doc_thresh': 1,
            'initialize_word_vectors': False,
            'vectors': 'anes_word2vec_300',  # default_word2vec_300, anes_word2vec_300, chars_word2vec_25, eye_1 ...
            'init_scale': 0.2,
            'add_OOV_dim': False,
            'win': 1,                    # size of context window
            'add_DRLD': False,
            'rnn_type': 'LSTM',          # basic, GRU, or LSTM
            'n_hidden': 50,              # size of hidden units
            'pooling_method': 'last',    # last, max, mean, or attention1/2
            'bidirectional': False,
            'bi_combine': 'concat',      # concat, max, or mean
            'train_embeddings': False,
            'lr': 0.025,                 # learning rate
            'lr_emb_fac': 0.2,           # factor to modify learning rate for embeddings
            'decay_delay': 5,            # number of epochs with no improvement before decreasing learning rate
            'decay_factor': 0.5,         # factor by which to multiply learning rate in case of delay
            'n_epochs': 100,
            'add_OOV_noise': False,
            'OOV_noise_prob': 0.01,
            'minibatch_size': 1,
            'classify_minibatch_size': 1,
            'ensemble': False,
            'save_model': True,
            'seed': 42,
            'verbose': 1,
            'reuse': False,
            'orig_T': 0.04,
            'tau': 0.01,
            'xavier_init': True
        }

    # this script analyzes one specific saved model; its stored parameters
    # override the defaults above
    params = fh.read_json('/Users/dcard/Projects/CMU/ARK/guac/experiments/rnn/bayes_opt_rnn_LSTM_reuse_mod_34_rerun/params.txt')
    params['n_hidden'] = int(params['n_hidden'])

    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ':', params[key]

    # seed the random number generators
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    vector_type = params['vectors'].split('_')[0]
    params['word2vec_dim'] = int(params['vectors'].split('_')[-1])

    reuser = None
    if params['reuse']:
        reuser = reusable_holdout.ReuseableHoldout(T=params['orig_T'], tau=params['tau'])

    if params['dataset'] == 'DRLD':
        datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']
    elif params['dataset'] == 'MIP':
        datasets = ['MIP-Personal-1', 'MIP-Personal-2', 'MIP-Political-1', 'MIP-Political-2']
    elif params['dataset'] == 'MOLD':
        datasets = ['McCain-Likes', 'McCain-Dislikes', 'Obama-Likes', 'Obama-Dislikes']
    elif params['dataset'] == 'Primary':
        datasets = ['Obama-Primary', 'Clinton-Primary']
    elif params['dataset'] == 'General':
        datasets = ['Obama-General', 'McCain-General']
    else:
        datasets = [params['dataset']]

    np.random.seed(params['seed'])
    random.seed(params['seed'])

    best_valid_f1s = []
    best_true_valid_f1s = []
    best_test_f1s = []
    best_train_f1s = []
    test_prediction_arrays = []

    output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'])
    output_filename = fh.make_filename(output_dir, 'params', 'txt')
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params['n_dev_folds']):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'], 'fold' + str(dev_fold))

        all_data, words2idx, items, all_labels = common.load_data(datasets, params['test_fold'], dev_fold,
                                                                  params['min_doc_thresh'])
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy

        train_lengths = [len(x) for x in train_lex]
        length_order = np.argsort(train_lengths)

        print "padding input with zeros"
        all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex)
        train_lex, valid_lex, test_lex = all_data
        train_masks, valid_masks, test_masks = all_masks

        print "expanding x with context windows"
        # convert x to context windows in advance
        train_x_win = expand_x_with_context_win(train_lex, params['win'])
        valid_x_win = expand_x_with_context_win(valid_lex, params['win'])
        test_x_win = expand_x_with_context_win(test_lex, params['win'])
        order = range(len(train_lex))
        print "done"

        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, 'n_train', n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        if params['initialize_word_vectors']:
            initial_embeddings = common.load_embeddings(params, words2idx)
            emb_dim = initial_embeddings.shape[1]
        else:
            initial_embeddings = None
            emb_dim = params['word2vec_dim']
        print "embedding dim =", emb_dim

        extra_input_dims = 0
        if params['add_DRLD']:
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(nh=params['n_hidden'],
                  nc=n_codes,
                  ne=vocsize,
                  de=emb_dim,
                  cs=params['win'],
                  extra_input_dims=extra_input_dims,
                  initial_embeddings=initial_embeddings,
                  init_scale=params['init_scale'],
                  rnn_type=params['rnn_type'],
                  train_embeddings=params['train_embeddings'],
                  pooling_method=params['pooling_method'],
                  bidirectional=params['bidirectional'],
                  bi_combine=params['bi_combine'],
                  xavier_init=params['xavier_init'])

        # add extra input dimensions to differentiate between paired datasets
        train_likes = [1 if re.search('Likes', i) else 0 for i in train_items]
        dev_likes = [1 if re.search('Likes', i) else 0 for i in dev_items]
        test_likes = [1 if re.search('Likes', i) else 0 for i in test_items]

        train_dem = [1 if re.search('Democrat', i) else 0 for i in train_items]
        dev_dem = [1 if re.search('Democrat', i) else 0 for i in dev_items]
        test_dem = [1 if re.search('Democrat', i) else 0 for i in test_items]

        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        ### LOAD the saved model (no training happens in this variant)
        rnn.load(output_dir)

        best_f1 = -np.inf
        params['clr'] = params['lr']

        n_train = len(order)
        predictions_train = predict(n_train, params['classify_minibatch_size'], train_x_win, train_masks,
                                    train_y, params['win'], extra_input_dims, train_extra, rnn, order)
        n_valid = len(valid_lex)
        n_test = len(test_lex)
        predictions_valid = predict(n_valid, params['classify_minibatch_size'], valid_x_win, valid_masks,
                                    valid_y, params['win'], extra_input_dims, dev_extra, rnn)
        predictions_test = predict(n_test, params['classify_minibatch_size'], test_x_win, test_masks,
                                   test_y, params['win'], extra_input_dims, test_extra, rnn)

        train_f1 = common.calc_mean_f1(predictions_train, train_y)
        test_f1 = common.calc_mean_f1(predictions_test, test_y)
        valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)

        # step through each item one at a time and write its responses to disk
        output_dir = fh.makedirs(output_dir, 'responses')
        ms = 1
        for i in range(n_train):
            mb_x, mb_masks, mb_extra, mb_y = select_minibatch(train_x_win, train_masks, train_extra, train_y,
                                                              params['win'], i, ms, order=range(len(train_y)))
            h, W, b, p_y, s, i_f, i_r, \
                f_f, f_r, o_f, o_r, c = rnn.step_through(mb_x, mb_masks, params['win'],
                                                         extra_input_dims, mb_extra)
            # recompute the per-timestep sigmoid outputs from the hidden states
            temp = np.dot(h, W) + b
            s = 1.0 / (1.0 + np.exp(-temp))
            output_filename = fh.make_filename(output_dir, train_items[i], 'csv')
            np.savetxt(output_filename, s[:, 0, :], delimiter=',')
            output_npy_files(output_dir, train_items[i], i_f, i_r, f_f, f_r, o_f, o_r, h, c)

        for i in range(n_valid):
            mb_x, mb_masks, mb_extra, mb_y = select_minibatch(valid_x_win, valid_masks, dev_extra, valid_y,
                                                              params['win'], i, ms, order=range(len(valid_y)))
            h, W, b, p_y, s, i_f, i_r, \
                f_f, f_r, o_f, o_r, c = rnn.step_through(mb_x, mb_masks, params['win'],
                                                         extra_input_dims, mb_extra)
            temp = np.dot(h, W) + b
            s = 1.0 / (1.0 + np.exp(-temp))
            output_filename = fh.make_filename(output_dir, dev_items[i], 'csv')
            np.savetxt(output_filename, s[:, 0, :], delimiter=',')
            output_npy_files(output_dir, dev_items[i], i_f, i_r, f_f, f_r, o_f, o_r, h, c)

        for i in range(n_test):
            mb_x, mb_masks, mb_extra, mb_y = select_minibatch(test_x_win, test_masks, test_extra, test_y,
                                                              params['win'], i, ms, order=range(len(test_y)))
            h, W, b, p_y, s, i_f, i_r, \
                f_f, f_r, o_f, o_r, c = rnn.step_through(mb_x, mb_masks, params['win'],
                                                         extra_input_dims, mb_extra)
            temp = np.dot(h, W) + b
            s = 1.0 / (1.0 + np.exp(-temp))
            output_filename = fh.make_filename(output_dir, test_items[i], 'csv')
            np.savetxt(output_filename, s[:, 0, :], delimiter=',')
            output_npy_files(output_dir, test_items[i], i_f, i_r, f_f, f_r, o_f, o_r, h, c)

        print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1
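# Inspection sketch (an assumption, not in the source): each CSV written by
# the loops above holds one item's per-timestep sigmoid outputs, one row per
# token and one column per code. A quick way to find the token at which a
# given code fires most strongly; the file path below is hypothetical.
def peak_token_for_code(csv_path, code_index):
    activations = np.loadtxt(csv_path, delimiter=',')  # shape: (n_tokens, n_codes)
    return int(np.argmax(activations[:, code_index]))

# e.g. peak_token_for_code('responses/Democrat-Likes_00123.csv', 0)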
# Variant 3: character-level test on the DRLD data, using an identity matrix
# as the initial character embeddings. Same reconstructed imports as the
# first variant above.


def main(params=None):
    if params is None:
        params = {
            "exp_name": "car_test",
            "test_fold": 0,
            "n_dev_folds": 1,
            "min_doc_thresh": 1,
            "initialize_word_vectors": True,
            "vectors": "char",  # default_word2vec, anes_word2vec_300 ...
            "word2vec_dim": 93,
            "init_scale": 0.2,
            "add_OOV": False,
            "win": 1,  # size of context window
            "add_DRLD": False,
            "rnn_type": "basic",  # basic, GRU, or LSTM
            "n_hidden": 93,  # size of hidden units
            "pooling_method": "max",  # max, mean, or attention1/2
            "bidirectional": True,
            "bi_combine": "concat",  # concat, max, or mean
            "train_embeddings": True,
            "lr": 0.1,  # learning rate
            "lr_emb_fac": 0.2,  # factor to modify learning rate for embeddings
            "decay_delay": 5,  # number of epochs with no improvement before decreasing learning rate
            "decay_factor": 0.5,  # factor by which to multiply learning rate in case of delay
            "n_epochs": 100,
            "add_OOV_noise": False,
            "OOV_noise_prob": 0.00,
            "minibatch_size": 1,
            "classify_minibatch_size": 1,
            "ensemble": False,
            "save_model": True,
            "seed": 42,
            "verbose": 1,
            "reuse": False,
            "orig_T": 0.04,
            "tau": 0.01,
        }

    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ":", params[key]

    # seed the random number generators
    np.random.seed(params["seed"])
    random.seed(params["seed"])

    reuser = None
    if params["reuse"]:
        reuser = reusable_holdout.ReuseableHoldout(T=params["orig_T"], tau=params["tau"])

    datasets = ["Democrat-Likes", "Democrat-Dislikes", "Republican-Likes", "Republican-Dislikes"]

    np.random.seed(params["seed"])
    random.seed(params["seed"])

    best_valid_f1s = []
    best_true_valid_f1s = []
    best_test_f1s = []
    test_prediction_arrays = []

    output_dir = fh.makedirs(defines.exp_dir, "rnn", params["exp_name"])
    output_filename = fh.make_filename(output_dir, "params", "txt")
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params["n_dev_folds"]):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, "rnn", params["exp_name"], "fold" + str(dev_fold))

        all_data, words2idx, items, all_labels = common.load_char_data(datasets, params["test_fold"], dev_fold)
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy

        print "padding input with zeros"
        all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex)
        train_lex, valid_lex, test_lex = all_data
        train_masks, valid_masks, test_masks = all_masks

        print "expanding x with context windows"
        # convert x to context windows in advance
        train_x_win = expand_x_with_context_win(train_lex, params["win"])
        valid_x_win = expand_x_with_context_win(valid_lex, params["win"])
        test_x_win = expand_x_with_context_win(test_lex, params["win"])
        order = range(len(train_lex))
        print "done"

        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, "n_train", n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        if params["initialize_word_vectors"]:
            # identity matrix: one-hot initial embeddings for the characters
            initial_embeddings = np.eye(vocsize)
            emb_dim = initial_embeddings.shape[1]
            print "emb_dim =", emb_dim
        else:
            initial_embeddings = None
            emb_dim = params["word2vec_dim"]

        extra_input_dims = 0
        if params["add_DRLD"]:
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(
            nh=params["n_hidden"],
            nc=n_codes,
            ne=vocsize,
            de=emb_dim,
            cs=params["win"],
            extra_input_dims=extra_input_dims,
            initial_embeddings=initial_embeddings,
            init_scale=params["init_scale"],
            rnn_type=params["rnn_type"],
            train_embeddings=params["train_embeddings"],
            pooling_method=params["pooling_method"],
            bidirectional=params["bidirectional"],
            bi_combine=params["bi_combine"],
        )

        # add extra input dimensions to differentiate between paired datasets
        train_likes = [1 if re.search("Likes", i) else 0 for i in train_items]
        dev_likes = [1 if re.search("Likes", i) else 0 for i in dev_items]
        test_likes = [1 if re.search("Likes", i) else 0 for i in test_items]

        train_dem = [1 if re.search("Democrat", i) else 0 for i in train_items]
        dev_dem = [1 if re.search("Democrat", i) else 0 for i in dev_items]
        test_dem = [1 if re.search("Democrat", i) else 0 for i in test_items]

        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        # train with early stopping on the validation set
        best_f1 = -np.inf
        params["clr"] = params["lr"]
        for e in xrange(params["n_epochs"]):
            # shuffle the input data
            shuffle([order, train_lex, train_y, train_extra, train_masks], params["seed"])
            params["ce"] = e  # store the current epoch
            tic = timeit.default_timer()

            ms = params["minibatch_size"]
            n_train = len(train_lex)
            for iteration, i in enumerate(range(0, n_train, ms)):
                minibatch_x, minibatch_mask, minibatch_extra, minibatch_y = select_minibatch(
                    train_x_win,
                    train_masks,
                    train_extra,
                    train_y,
                    params["win"],
                    i,
                    ms,
                    order,
                    params["add_OOV_noise"],
                    params["OOV_noise_prob"],
                )
                nll, a_sum = rnn.train(
                    minibatch_x,
                    minibatch_mask,
                    minibatch_y,
                    params["win"],
                    params["clr"],
                    params["lr_emb_fac"],
                    extra_input_dims,
                    minibatch_extra,
                )
                print "[learning] epoch %i >> %2.2f%%" % (e, (i + 1) * 100.0 / float(n_sentences)),
                print "completed in %.2f (sec), nll = %.2f, a_sum = %.1f <<\r" % (
                    timeit.default_timer() - tic,
                    nll,
                    np.max(a_sum),
                ),
                sys.stdout.flush()

            if np.isnan(nll) or np.isinf(nll):
                return {
                    "loss": nll,
                    "final_test_f1": 0,
                    "valid_f1s": 0,
                    "true_valid_f1s": 0,
                    "test_f1s": 0,
                    "train_f1s": 0,
                    "status": STATUS_OK,
                }

            # evaluation // back into the real world : idx -> words
            print ""

            predictions_train = predict(
                n_train,
                params["classify_minibatch_size"],
                train_x_win,
                train_masks,
                train_y,
                params["win"],
                extra_input_dims,
                train_extra,
                rnn,
                order,
            )
            n_valid = len(valid_lex)
            n_test = len(test_lex)
            predictions_valid = predict(
                n_valid,
                params["classify_minibatch_size"],
                valid_x_win,
                valid_masks,
                valid_y,
                params["win"],
                extra_input_dims,
                dev_extra,
                rnn,
            )
            predictions_test = predict(
                n_test,
                params["classify_minibatch_size"],
                test_x_win,
                test_masks,
                test_y,
                params["win"],
                extra_input_dims,
                test_extra,
                rnn,
            )

            train_f1 = common.calc_mean_f1(predictions_train, train_y)
            test_f1 = common.calc_mean_f1(predictions_test, test_y)
            valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)
            question_f1s = []
            question_pps = []
            print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1

            if valid_f1 > best_f1:
                best_rnn = copy.deepcopy(rnn)
                best_f1 = valid_f1
                best_test_predictions = predictions_test
                if params["verbose"]:
                    print ("NEW BEST: epoch", e, "valid f1", valid_f1, "best test f1", test_f1)
                params["tr_f1"] = train_f1
                params["te_f1"] = test_f1
                params["v_f1"] = valid_f1
                params["be"] = e  # store the current epoch as a new best

            # learning rate decay if no improvement in a given number of epochs
            if abs(params["be"] - params["ce"]) >= params["decay_delay"]:
                params["clr"] *= params["decay_factor"]
                params["be"] = params["ce"]
                print "Reverting to current best; new learning rate = ", params["clr"]
                # also reset to the previous best model
                rnn = best_rnn

            if params["clr"] < 1e-5:
                break
            if best_f1 == 1.0:
                break
            if best_f1 == 0 and e > 6:
                break

        if params["save_model"]:
            predictions_valid = predict(
                len(valid_y),
                params["classify_minibatch_size"],
                valid_x_win,
                valid_masks,
                valid_y,
                params["win"],
                extra_input_dims,
                dev_extra,
                rnn,
            )
            best_rnn.save(output_dir)
            common.write_predictions(datasets, params["test_fold"], dev_fold, predictions_valid,
                                     dev_items, output_dir)

        print (
            "BEST RESULT: epoch",
            params["be"],
            "train F1 ",
            params["tr_f1"],
            "valid F1",
            params["v_f1"],
            "best test F1",
            params["te_f1"],
            "with the model",
            output_dir,
        )

        best_true_valid_f1s.append(params["v_f1"])
        best_test_f1s.append(params["te_f1"])
        if reuser is not None:
            best_valid_f1 = reuser.mask_value(params["v_f1"], params["tr_f1"])
        else:
            best_valid_f1 = params["v_f1"]
        best_valid_f1s.append(best_valid_f1)
        test_prediction_arrays.append(np.array(best_test_predictions, dtype=int))

    params["ensemble"] = False  # ensembling across folds is disabled here
    if params["ensemble"]:
        test_predictions_stack = np.dstack(test_prediction_arrays)
        final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0]
        predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes)
        true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes)
        final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df)
    else:
        final_test_f1 = np.median(best_test_f1s)

    return {
        "loss": -np.median(best_valid_f1s),
        "final_test_f1": final_test_f1,
        "valid_f1s": best_valid_f1s,
        "true_valid_f1s": best_true_valid_f1s,
        "test_f1s": best_test_f1s,
        "status": STATUS_OK,
    }
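# Worked example (illustrative) of the ensemble branch above: per-fold binary
# predictions are stacked on a third axis and majority-voted with the mode
# across folds. With three folds, one item, and two codes:
#
#     fold_preds = [np.array([[1, 0]]), np.array([[1, 1]]), np.array([[0, 1]])]
#     stack = np.dstack(fold_preds)               # shape (1, 2, 3)
#     stats.mode(stack, axis=2)[0][:, :, 0]       # -> array([[1, 1]])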
# Data preparation for an image regression model (TensorFlow). The print
# message is translated from French; config.hauteur / config.largeur are the
# image height / width.
import sys
import time

import cv2
import numpy as np
import tensorflow as tf

import common
import config
import model

images, labels = common.prepare_data('training_set.csv')
images = np.array(images, dtype=np.float32) / 255
labels = np.array(labels, dtype=np.float32)

# shuffle the examples and add a channel dimension
index = np.random.permutation(len(images))
images = images[index].reshape(-1, config.hauteur, config.largeur, 1)
labels = labels[index]
print("Number of images:", len(images))

train_ds = tf.data.Dataset.from_tensor_slices((images, labels)).batch(config.batch_size)
del images
del labels


def my_loss(labels, preds):
    lambda_xy = 5
    lambda_Aa = 5
    lambda_angle = 1
    preds_xy = preds[:, 0:2]
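# The body of my_loss is cut off above. A hedged sketch of how such a
# weighted regression loss might continue, assuming the [x, y, A, a, angle]
# output layout implied by the lambda names; the slicing and the
# squared-error form are assumptions, not the original code.
def my_loss_sketch(labels, preds):
    lambda_xy = 5      # weight on the position terms
    lambda_Aa = 5      # weight on the axis-length terms
    lambda_angle = 1   # weight on the orientation term
    xy_loss = tf.reduce_mean(tf.square(labels[:, 0:2] - preds[:, 0:2]))
    Aa_loss = tf.reduce_mean(tf.square(labels[:, 2:4] - preds[:, 2:4]))
    angle_loss = tf.reduce_mean(tf.square(labels[:, 4:5] - preds[:, 4:5]))
    return lambda_xy * xy_loss + lambda_Aa * Aa_loss + lambda_angle * angle_loss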
""" Yield batch data. Note that smaller batch than batch_size is ignored """ # iterator start = 0 num_ex = images.shape[0] while True: end = start + batch_size if end > num_ex: break yield images[start:end], labels[start:end] start = end model_dir = "./model_dir" train_data, test_data = prepare_data("mnist.data.pkl") batch_size = 100 model = MnistModel(batch_size) saver = tf.train.Saver() sv = tf.train.Supervisor(logdir=model_dir, saver=saver) with sv.managed_session(config=tf.ConfigProto()) as sess: images, labels = train_data epoch = 0 while not sv.should_stop(): fetches = [model.loss, model.train_op] for batch_images, batch_labels in next_batch(images, labels, batch_size):
# Train and compare four classifiers on Fashion-MNIST with TensorBoard
# logging; the model builders live in the project-local algorithms module.
from tensorflow import keras

from algorithms import logical_regression, dense_neural_network, convolutional_neural_network
from common import prepare_data, get_mistakes, visualize_predictions

if __name__ == '__main__':
    x_train, y_train, x_valid, y_valid, x_test, y_test = prepare_data(
        train_csv="fashion-mnist_train.csv", test_csv="fashion-mnist_test.csv")

    # Register TensorBoard callbacks
    LR_callback = keras.callbacks.TensorBoard(log_dir="logs\\LR")
    DNN_callback = keras.callbacks.TensorBoard(log_dir="logs\\DNN")
    CNN_callback = keras.callbacks.TensorBoard(log_dir="logs\\CNN")
    BN_callback = keras.callbacks.TensorBoard(log_dir="logs\\BN")

    # Train the different models on the train and validation sets
    log_reg = logical_regression(x_train=x_train, x_valid=x_valid, y_train=y_train,
                                 y_valid=y_valid, callback=LR_callback)
    dense_nn = dense_neural_network(x_train=x_train, x_valid=x_valid, y_train=y_train,
                                    y_valid=y_valid, callback=DNN_callback)
    conv_nn = convolutional_neural_network(x_train=x_train, x_valid=x_valid, y_train=y_train,
                                           y_valid=y_valid, callback=CNN_callback)
    # The source is cut off mid-call here; the remaining arguments are filled
    # in by analogy with the three calls above (any batch-norm switch that
    # distinguishes this model is not visible in the source).
    bn_conv_nn = convolutional_neural_network(x_train=x_train, x_valid=x_valid, y_train=y_train,
                                              y_valid=y_valid, callback=BN_callback)
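    # Evaluation sketch (an assumption, not from the source): get_mistakes and
    # visualize_predictions are imported above but never used in the visible
    # fragment, and their signatures are unknown; a plausible use on the
    # held-out test set might be:
    #
    #     mistakes = get_mistakes(conv_nn, x_test, y_test)
    #     visualize_predictions(conv_nn, x_test, y_test)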