def main(params):
    """Train a CharTranslator as a generative model or an author classifier.

    Builds (or restores from ``params['resume']``) the token vocabulary and
    author index, constructs the model, optimizer and loss, then runs
    ``max_epochs`` worth of mini-batch updates. Validation runs every
    ``eval_interval`` epochs' worth of iterations; at every ``log_interval``
    iterations the running loss is printed and a checkpoint is written when
    the latest validation rank is at least as good as the best seen so far.

    Args:
        params: dict of run options. Keys read here: 'resume', 'atoms',
            'vocab_threshold', 'use_sgd', 'learning_rate', 'decay_rate',
            'smooth_eps', 'mode', 'batch_size', 'randomize_batches',
            'max_epochs', 'eval_interval', 'split_generators',
            'sample_by_len', 'max_seq_len', 'grad_clip', 'log_interval',
            'fappend' and 'checkpoint_output_directory'. The keys
            'vocabulary_size' and 'num_output_layers' are written into it
            before the model is built.

    Returns:
        None. Results are reported on stdout and through save_checkpoint().
    """
    dp = DataProvider(params)

    # Create vocabulary and author index, or reuse the ones stored in the
    # checkpoint so token/author ids stay aligned with the saved weights.
    if params['resume'] is None:
        if params['atoms'] == 'char':
            char_to_ix, ix_to_char = dp.createCharVocab(
                params['vocab_threshold'])
        else:
            char_to_ix, ix_to_char = dp.createWordVocab(
                params['vocab_threshold'])
        auth_to_ix, ix_to_auth = dp.createAuthorIdx()
    else:
        saved_model = torch.load(params['resume'])
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_auth = saved_model['ix_to_auth']
        ix_to_char = saved_model['ix_to_char']

    params['vocabulary_size'] = len(char_to_ix)
    params['num_output_layers'] = len(auth_to_ix)

    model = CharTranslator(params)
    # set to train mode, this activates dropout
    model.train()

    # Initialize the optimizer: SGD with momentum, or RMSprop.
    # 'decay_rate' doubles as the SGD momentum and the RMSprop alpha.
    if params['use_sgd']:
        optim = torch.optim.SGD(model.parameters(),
                                lr=params['learning_rate'],
                                momentum=params['decay_rate'])
    else:
        optim = torch.optim.RMSprop(model.parameters(),
                                    lr=params['learning_rate'],
                                    alpha=params['decay_rate'],
                                    eps=params['smooth_eps'])

    # Loss function
    if params['mode'] == 'generative':
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.NLLLoss()

    # Restore saved checkpoint
    if params['resume'] is not None:
        model.load_state_dict(saved_model['state_dict'])
        optim.load_state_dict(saved_model['optimizer'])

    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(params['batch_size'])
    hidden_zeros = model.init_hidden(params['batch_size'])

    # Initialize the cache
    if params['randomize_batches']:
        dp.set_hid_cache(range(len(dp.data['docs'])), hidden_zeros)

    # Compute the iteration parameters
    epochs = params['max_epochs']
    total_seqs = dp.get_num_sents(split='train')
    iter_per_epoch = total_seqs // params['batch_size']
    total_iters = iter_per_epoch * epochs
    best_val = 1000.
    # Clamp to 1: a tiny dataset or eval_interval would otherwise give 0 and
    # make `i % eval_every` raise ZeroDivisionError on the first iteration.
    eval_every = max(1, int(iter_per_epoch * params['eval_interval']))
    val_score = 0.
    val_rank = 1000

    if params['mode'] == 'generative':
        eval_function = eval_translator
    else:
        eval_function = eval_classify

    print(total_iters)
    for i in xrange(total_iters):
        # TODO
        if params['split_generators']:
            c_aid = ix_to_auth[np.random.choice(auth_to_ix.values())]
        else:
            c_aid = None

        batch = dp.get_sentence_batch(params['batch_size'],
                                      split='train',
                                      atoms=params['atoms'],
                                      aid=c_aid,
                                      sample_by_len=params['sample_by_len'])
        inps, targs, auths, lens = dp.prepare_data(
            batch, char_to_ix, auth_to_ix, maxlen=params['max_seq_len'])

        # Starting each batch, we detach the hidden state from how it was
        # previously produced. If we didn't, the model would try
        # backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()

        # TODO
        if params['mode'] == 'generative':
            output, _ = model.forward_mltrain(inps, lens, inps, lens,
                                              hidden_zeros, auths=auths)
            targets = pack_padded_sequence(Variable(targs).cuda(), lens)
            loss = criterion(pack_padded_sequence(output, lens)[0],
                             targets[0])
        else:
            # for classifier auths is the target
            output, hidden = model.forward_classify(inps, hidden,
                                                    compute_softmax=True)
            targets = Variable(auths).cuda()
            loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in
        # RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), params['grad_clip'])

        # Take an optimization step
        optim.step()

        total_loss += loss.data.cpu().numpy()[0]

        # Periodically refresh the validation metrics used for checkpointing.
        if i % eval_every == 0 and i > 0:
            val_rank, val_score = eval_function(dp, model, params,
                                                char_to_ix, auth_to_ix,
                                                split='val')

        if i % params['log_interval'] == 0 and i > 0:
            cur_loss = total_loss / params['log_interval']
            elapsed = time.time() - start_time
            # Use params['log_interval'] here as everywhere else; the name
            # `args` is not defined inside this function.
            print(
                '| epoch {:2.2f} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    float(i) / iter_per_epoch, i, total_iters,
                    params['learning_rate'],
                    elapsed * 1000 / params['log_interval'], cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0.

            # Checkpointing lives inside the logging branch because that is
            # the only place `cur_loss` is defined.
            if val_rank <= best_val:
                save_checkpoint(
                    {
                        'iter': i,
                        'arch': params,
                        'val_loss': val_rank,
                        'val_pplx': val_score,
                        'char_to_ix': char_to_ix,
                        'ix_to_char': ix_to_char,
                        'auth_to_ix': auth_to_ix,
                        'ix_to_auth': ix_to_auth,
                        'state_dict': model.state_dict(),
                        'loss': cur_loss,
                        'optimizer': optim.state_dict(),
                    },
                    fappend=params['fappend'],
                    outdir=params['checkpoint_output_directory'])
                best_val = val_rank
            start_time = time.time()
def _bow_rank_metrics(conf, target, topk):
    """Return (accuracy %, mean 0-based rank, top-k accuracy %) for one split.

    `conf` is a 2-D score array with one row per document and one column per
    author; `target` is the 1-D array of true author indices.
    """
    # Position of the true author when each row is sorted by descending
    # score; 0 means the true author received the highest score.
    true_rank = np.where(
        conf.argsort(axis=1)[:, ::-1] == target[:, None])[1]
    n_docs = len(target)
    accuracy = 100. * float((conf.argmax(axis=1) == target).sum()) / n_docs
    # Ranks are 0-based, so the top-k set is rank < k (rank <= k would
    # silently report top-(k+1) accuracy).
    topk_accuracy = (true_rank < topk).sum() * 100. / n_docs
    return accuracy, true_rank.mean(), topk_accuracy


def _bow_binary_auc(conf, target, n_auths):
    """Return (adjusted scores, AUC) for the binary same-author evaluation.

    Every document is paired once with its true author (positive) and once
    with a uniformly drawn author (negative). Each pair is scored by the
    normalised rank of that document's score within the author's column.
    """
    n_docs = conf.shape[0]
    # NOTE(review): the draw is independent of `target`, so a "negative" can
    # coincide with the true author.
    neg_auths = np.random.randint(0, n_auths, n_docs)
    columns = np.concatenate([target.astype(int), neg_auths])
    doc_ids = np.concatenate([np.arange(n_docs), np.arange(n_docs)])
    # argsort along axis 0 orders documents per column; argmax of the
    # equality mask finds where each pair's own document lands in that order.
    column_order = np.argsort(conf[:, columns], axis=0)
    adjusted = ((column_order == doc_ids).argmax(axis=0) + 1) / float(n_docs)
    labels = np.concatenate([
        np.ones(int(n_docs), dtype=int),
        np.zeros(int(n_docs), dtype=int)
    ])
    return adjusted, roc_auc_score(labels, adjusted)


def _bow_print_report(header, accuracy, mean_rank, topk, topk_accuracy,
                      adjusted, auc, n_auths):
    """Print the metric summary for one data split."""
    # First half of `adjusted` holds the positive pairs, second half the
    # negative pairs.
    n_docs = len(adjusted) // 2
    correct_pairs = ((adjusted[:n_docs] >= 0.5).sum() +
                     (adjusted[n_docs:] < 0.5).sum())
    print(header)
    print('Accuracy is %.2f, Mean rank is %.2f / %d' %
          (accuracy, mean_rank, n_auths))
    print('Top-%d Accuracy is %.2f' % (topk, topk_accuracy))
    print('Accuracy per adjusted scores %.3f' %
          (100. * correct_pairs / (2. * n_docs)))
    print('AUC is %.2f' % (auc))


def main(params):
    """Train and evaluate a bag-of-words author-identification baseline.

    Tokenizes every document, builds a vocabulary from the training split,
    extracts bag-of-words features (optionally PCA-reduced), normalizes them
    with training statistics, fits either a LinearSVC or an MLP classifier,
    and prints accuracy, mean rank, top-k accuracy and a binary AUC for the
    train and validation splits.

    Args:
        params: dict of run options. Keys read here: 'nostop', 'nopunct',
            'vocab_threshold', 'tfidf', 'pca', 'mlp', 'linearsvm', 'topk',
            and for the MLP 'epochs', 'lr' and 'l2'. 'pca' is overwritten
            with the reduced width when PCA runs; 'num_output_layers' and
            'inp_size' are written before the MLP is built.

    Returns:
        None. Metrics are printed to stdout.

    Raises:
        ValueError: if neither params['mlp'] nor params['linearsvm'] selects
            a classifier.
    """
    # Fail fast: without a classifier there are no scores to evaluate, and
    # the function would otherwise die later with a NameError.
    if not params['mlp'] and not params['linearsvm']:
        raise ValueError(
            "No classifier selected: set params['mlp'] or params['linearsvm']")

    dp = DataProvider(params)
    # The other entry point in this file unpacks (auth_to_ix, ix_to_auth)
    # from createAuthorIdx(); accept either a bare mapping or that pair.
    author_index = dp.createAuthorIdx()
    if isinstance(author_index, tuple):
        auth_to_ix = author_index[0]
    else:
        auth_to_ix = author_index

    # Tokens to drop. Digits are listed as strings because tokens are
    # strings; integer entries could never match.
    bad_hombres = [str(digit) for digit in range(10)]
    if params['nostop']:
        bad_hombres = bad_hombres + stopwords.words('english')
    if params['nopunct']:
        bad_hombres = bad_hombres + list(string.punctuation)
    bad_hombres = set(bad_hombres)

    # Preprocess: lower-case, strip digit runs, tokenize and filter. Word
    # counts come from the training split only.
    all_words = Counter()
    for i, doc in enumerate(dp.data['docs']):
        no_num = re.sub(r'\d+', '', doc['text'].lower())
        curr_text = [
            w for w in wordpunct_tokenize(no_num) if w not in bad_hombres
        ]
        dp.data['docs'][i]['tokenized'] = curr_text
        if doc['split'] == 'train':
            all_words.update(curr_text)

    frequent_words = [
        wrd for wrd in all_words
        if all_words[wrd] > params['vocab_threshold']
    ]
    short_vocab = {w: i for i, w in enumerate(frequent_words)}

    docCounts_train, target_train = count(dp, short_vocab, auth_to_ix,
                                          split='train')
    bow_features_train, idf_train = bow_features(docCounts_train,
                                                 params['tfidf'])

    docCounts_val, target_val = count(dp, short_vocab, auth_to_ix,
                                      split='val')
    # Reuse the training idf for the validation features.
    bow_features_val, _ = bow_features(docCounts_val, params['tfidf'],
                                       idf=idf_train)

    # Optional PCA, fitted on the training features only.
    if params['pca'] > 0:
        pca_model = PCA(n_components=params['pca'])
        bow_features_train = pca_model.fit_transform(bow_features_train)
        print('Explained variance is %.2f' %
              (sum(pca_model.explained_variance_ratio_)))
        bow_features_val = pca_model.transform(bow_features_val)
        params['pca'] = bow_features_train.shape[-1]

    # Normalize the data with training-set statistics
    bow_features_train, mean_tr, std_tr = normalize(bow_features_train)
    bow_features_val, _, _ = normalize(bow_features_val, mean_tr, std_tr)

    if not params['mlp']:
        # Linear SVC already implements one-vs-rest
        svm_model = LinearSVC()
        svm_model.fit(bow_features_train, target_train)
        confTr = svm_model.decision_function(bow_features_train)
        confVal = svm_model.decision_function(bow_features_val)
    else:
        params['num_output_layers'] = len(auth_to_ix)
        # With PCA disabled params['pca'] is 0, which is not a usable input
        # width; fall back to the actual feature dimension.
        if params['pca'] > 0:
            params['inp_size'] = params['pca']
        else:
            params['inp_size'] = bow_features_train.shape[-1]
        model = MLP_classifier(params)
        model.fit(bow_features_train, target_train, bow_features_val,
                  target_val, params['epochs'], params['lr'], params['l2'])
        confTr = model.decision_function(bow_features_train)
        confVal = model.decision_function(bow_features_val)

    train_accuracy, mean_rank_train, topk_train = _bow_rank_metrics(
        confTr, target_train, params['topk'])
    val_accuracy, mean_rank_val, topk_val = _bow_rank_metrics(
        confVal, target_val, params['topk'])

    # Do the binary evaluation similar to Bagnall. Train is scored before
    # val so the random negative draws happen in a fixed order.
    n_auths = len(auth_to_ix)
    adjusted_scores_tr, auc_tr = _bow_binary_auc(confTr, target_train,
                                                 n_auths)
    adjusted_scores_val, auc_val = _bow_binary_auc(confVal, target_val,
                                                   n_auths)

    _bow_print_report('------------- Training set-------------------',
                      train_accuracy, mean_rank_train, params['topk'],
                      topk_train, adjusted_scores_tr, auc_tr, n_auths)
    _bow_print_report('------------- Val set-------------------',
                      val_accuracy, mean_rank_val, params['topk'], topk_val,
                      adjusted_scores_val, auc_val, n_auths)

    print('--------------------------------------------------------------------------')
    print('--------------------------------------------------------------------------\n\n')