def main(fold_path, prompt):
    """Train and evaluate an essay-scoring model on one cross-validation fold."""

    # get data
    train_path = fold_path + '/train.tsv'
    dev_path = fold_path + '/dev.tsv'
    test_path = fold_path + '/test.tsv'
    
    train_essays, dev_essays, test_essays = \
            load_fold(train_path, dev_path, test_path, prompt)
        
    (train_x,train_scores,train_ids,train_pmt) = prepare_data(train_essays)
    (dev_x,dev_scores,dev_ids,dev_pmt) = prepare_data(dev_essays)
    (test_x,test_scores,test_ids,test_pmt) = prepare_data(test_essays)

    with open(fold_path + '/train_feat_p%s.pkl' % prompt, 'rb') as f:
        train_feat = pickle.load(f)

    with open(fold_path + '/dev_feat_p%s.pkl' % prompt, 'rb') as f:
        dev_feat = pickle.load(f)

    with open(fold_path + '/test_feat_p%s.pkl' % prompt, 'rb') as f:
        test_feat = pickle.load(f)

    
    logger.info(" train_feat: %s, dev_feat: %s, test_feat: %s"%(str(train_feat.shape), str(dev_feat.shape), str(test_feat.shape)))
    
    train_feat = preprocessing.scale(train_feat, axis=0)
    dev_feat = preprocessing.scale(dev_feat, axis=0)
    test_feat = preprocessing.scale(test_feat, axis=0)
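    # preprocessing.scale above standardises each handcrafted-feature column to
    # zero mean and unit variance, independently per split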

    
    train_feat = list(train_feat)
    dev_feat = list(dev_feat)
    test_feat = list(test_feat)

    
    lang = make_lang(train_x,dev_x,test_x)
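    # make_lang presumably builds the shared vocabulary over all three splits and
    # exposes word->index lookup, pretrained embeddings (lang.embeddings) and the
    # padding index (lang.PAD_index) used further down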
    

#    # debugging aid (disabled): step through the test essays one at a time
#    model_friendly_scores = get_model_friendly_scores(np.array(test_scores), np.array(test_pmt))
#
#    for i in range(len(test_x)):
#        essay = test_x[i]
#        model_friendly_score = model_friendly_scores[i]
#        if model_friendly_score > 0.0:
#            print()
#            print(test_ids[i])
#            print(essay)
#            print(model_friendly_score)
#            var = input("Do you want to see the next essay (Y/N): ")
#            if var.lower() == 'n':
#                break
#    return

    get_MAX_LENS(train_x,dev_x,test_x)
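    # get_MAX_LENS presumably records the maximum document/sentence lengths into
    # params['max_doc_len'] / params['max_sent_len'], which are read just below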
    
    if params['padding_level'] == 'document':
        logger.info( "train padding (document level) and indexing  ....")
        train_x, train_mask = convert_seq_to_indices(train_x, lang, params['max_doc_len'])
        logger.info( "dev padding (document level) and indexing ....")
        dev_x, dev_mask = convert_seq_to_indices(dev_x, lang, params['max_doc_len'])
        logger.info( "test padding (document level) and indexing ....")
        test_x, test_mask = convert_seq_to_indices(test_x, lang, params['max_doc_len'])
    
    elif params['padding_level'] == 'sentence':
        logger.info( "train padding (sentence level) and indexing ....")
        train_x, train_mask = convert_seq_to_indices(train_x, lang, params['max_sent_len'])
        logger.info( "dev padding (sentence level) and indexing ....")
        dev_x, dev_mask = convert_seq_to_indices(dev_x, lang, params['max_sent_len'])
        logger.info( "test padding (sentence level) and indexing ....")
        test_x, test_mask = convert_seq_to_indices(test_x, lang, params['max_sent_len'])
        
        logger.info( "make length of all sentences equal ....")
        sent_pad, out_mask, _ =  lang.indicies_from_sentence([], params['max_sent_len'],params['padding_place'])
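        # NOTE: sent_pad and out_mask are not passed on below;
        # doc_padding_by_sent_pad presumably re-derives the all-PAD sentence from `lang`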

        train_x, train_mask =  doc_padding_by_sent_pad(train_x, train_mask, lang)
        dev_x, dev_mask =  doc_padding_by_sent_pad(dev_x, dev_mask, lang)
        test_x, test_mask =  doc_padding_by_sent_pad(test_x, test_mask, lang)

        
    logger.info("prompt:%d"%prompt)
    
    logger.info("score: %d - %d"%(np.min(train_scores+dev_scores+test_scores),
                            np.max(train_scores+dev_scores+test_scores)))
    
    logger.info("score Med: %d"%(np.median(train_scores+dev_scores+test_scores)))
    
    train_scores = np.array(train_scores,dtype = 'float')
    dev_scores = np.array(dev_scores,dtype = 'float')
    test_scores = np.array(test_scores,dtype = 'float')
    
    train_pmt = np.array(train_pmt, dtype='int32')
    dev_pmt = np.array(dev_pmt, dtype='int32')
    test_pmt = np.array(test_pmt, dtype='int32')
    
    
    train_mean = train_scores.mean(axis=0)
    train_std = train_scores.std(axis=0)
    dev_mean = dev_scores.mean(axis=0)
    dev_std = dev_scores.std(axis=0)
    test_mean = test_scores.mean(axis=0)
    test_std = test_scores.std(axis=0)
    
    logger.info( "train_mean = %f , train_std: %f"%(train_mean , train_std))
    logger.info( "dev_mean %f , dev_std:%f"%(dev_mean , dev_std))
    logger.info( "dev_mean %f , dev_std:%f"%(test_mean , test_std))
    
    
    # Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
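    # (assumption: get_model_friendly_scores applies the per-prompt ASAP min-max
    # scaling, y = (score - min_p) / (max_p - min_p); e.g. prompt 1 scores span
    # 2-12, so a raw 7 maps to (7 - 2) / 10 = 0.5)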
    train_y = dataset.get_model_friendly_scores(train_scores, train_pmt)
    dev_y = dataset.get_model_friendly_scores(dev_scores, dev_pmt)
    test_y = dataset.get_model_friendly_scores(test_scores, test_pmt)

    
    # materialise as lists so the samples can be sliced and iterated repeatedly
    train_samples = list(zip(train_x, train_mask, train_y, train_pmt, train_scores, train_feat))
    dev_samples = list(zip(dev_x, dev_mask, dev_y, dev_pmt, dev_scores, dev_feat))
    test_samples = list(zip(test_x, test_mask, test_y, test_pmt, test_scores, test_feat))
    
    
    ##########################
    ## model creation
    ##########################
    max_doc_len = params['max_doc_len']
    
    if params['padding_level'] == 'sentence':
        params['utt_size'] = params['max_sent_len']

    # the output layer is sized to the handcrafted feature vector
    # (the original note "self.table == 4" suggests this applies to that result-table mode)
    output_layer_size = train_feat[0].size
         
    logger.info(" params:\n %s"%params)
    
    model = Model(max_doc_len=max_doc_len, 
                  relation_size=params['relation_size'],
                  lstm_size = params['lstm_size'], 
                  voc_size = params['voc_size'], 
                  emb_size=params['emb_size'], 
                  dropout_rate=params['dropout_rate'],
                  embeddings = lang.embeddings,
                  mean_y = train_y.mean(axis=0),
                  pad_idx = lang.PAD_index,
                  utt_size =  params['utt_size'],
                  mode = params['model'],
                  table = params['result_table'],
                  output_layer_size = output_layer_size)
    
    
    run_on_gpu = params['RUN_ON_GPU'] and cuda.is_available()
    if run_on_gpu:
        model = model.cuda()

    #############################
    ## start training
    #############################
    if run_on_gpu:
        hist_epoch,hist_loss,hist_qwk_train,hist_qwk_dev, qwk_test = \
                                train(trainset_samples=train_samples,
                                     devset_samples=dev_samples,
                                     testset_samples = test_samples,
                                     model=model,
                                     prompt=prompt,
                                     batch_size=params['batch_size'],
                                     lang=lang)
                                
    else:
        # CPU fallback: smoke-test training on a tiny subset only
        hist_epoch,hist_loss,hist_qwk_train,hist_qwk_dev, qwk_test = \
                            train(trainset_samples = train_samples[:10],
                                 devset_samples = train_samples[:10],
                                 testset_samples = test_samples[:10],
                                 model=model,
                                 prompt=prompt,
                                 batch_size=params['batch_size'],
                                 lang=lang)
                            
                            
    return  qwk_test, model
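
# `train` above returns the test-set QWK (quadratic weighted kappa), the
# standard ASAP essay-scoring metric. Below is a minimal reference sketch,
# assuming integer gold/predicted scores on [min_rating, max_rating]; the
# helper name is ours and the metric is actually computed inside train().
import numpy as np

def quadratic_weighted_kappa(rater_a, rater_b, min_rating, max_rating):
    """QWK between two integer score vectors on [min_rating, max_rating]."""
    num_ratings = max_rating - min_rating + 1
    # observed score-pair counts
    observed = np.zeros((num_ratings, num_ratings))
    for a, b in zip(rater_a, rater_b):
        observed[a - min_rating, b - min_rating] += 1
    # expected counts under independence, normalised to the same total
    hist_a = observed.sum(axis=1)
    hist_b = observed.sum(axis=0)
    expected = np.outer(hist_a, hist_b) / len(rater_a)
    # quadratic penalty grows with the squared distance between the two scores
    i, j = np.indices((num_ratings, num_ratings))
    weights = (i - j) ** 2 / float((num_ratings - 1) ** 2)
    return 1.0 - (weights * observed).sum() / (weights * expected).sum()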
Example #2
# !!!!! This part is unused !!!!!

logger.info('  train_x shape: ' + str(np.array(train_x).shape))
logger.info('  dev_x shape:   ' + str(np.array(dev_x).shape))
logger.info('  test_x shape:  ' + str(np.array(test_x).shape))

logger.info('  train_y shape: ' + str(train_y.shape))
logger.info('  dev_y shape:   ' + str(dev_y.shape))
logger.info('  test_y shape:  ' + str(test_y.shape))

# We need the dev and test sets in the original scale for evaluation
dev_y_org = dev_y.astype(dataset.get_ref_dtype())
test_y_org = test_y.astype(dataset.get_ref_dtype())

# Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
train_y = dataset.get_model_friendly_scores(train_y, train_pmt)
dev_y = dataset.get_model_friendly_scores(dev_y, dev_pmt)
test_y = dataset.get_model_friendly_scores(test_y, test_pmt)

###############################################################################################################################
## Optimizer algorithm
#

from nea.optimizers import get_optimizer

optimizer = get_optimizer(args)
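# get_optimizer presumably maps args.algorithm ('rmsprop', 'sgd', 'adagrad',
# 'adadelta', 'adam', 'adamax') to the matching Keras optimizer, configured
# from the remaining args (learning rate, clipping, ...)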

###############################################################################################################################
## Building model
#
Example #3
def train(args):
    out_dir = args.out_dir_path
    U.mkdir_p(out_dir + '/preds')
    timestr = U.set_logger(onscreen=args.onscreen, out_dir=out_dir)
    U.print_args(args)

    # (legacy) assert args.model_type in {'mlp', 'cls', 'clsp', 'reg', 'regp', 'breg', 'bregp'}
    assert args.model_type in {'cls', 'reg'}
    assert args.algorithm in {
        'rmsprop', 'sgd', 'adagrad', 'adadelta', 'adam', 'adamax'
    }
    assert args.loss in {'mse', 'mae', 'cnp', 'hng'}
    assert args.recurrent_unit in {'lstm', 'gru', 'simple'}
    assert args.aggregation in {'mot', 'att'}

    if args.seed > 0:
        np.random.seed(args.seed)

    from nea.asap_evaluator import Evaluator
    import nea.asap_reader as dataset

    ###############################################################################################################################
    ## Prepare data
    #

    from keras.preprocessing import sequence

    if args.valid_split > 0:
        (train_x, train_y,
         train_pmt), (test_x, test_y,
                      test_pmt), vocab, overal_maxlen = dataset.get_data(
                          (args.train_path, args.test_path),
                          args.prompt_id,
                          args.vocab_size,
                          args.maxlen,
                          tokenize_text=True,
                          to_lower=True,
                          sort_by_len=False,
                          vocab_path=args.vocab_path)
    else:
        # data_x is a list of lists
        (train_x, train_y, train_pmt), (dev_x, dev_y, dev_pmt), (
            test_x, test_y, test_pmt), vocab, overal_maxlen = dataset.get_data(
                (args.train_path, args.dev_path, args.test_path),
                args.prompt_id,
                args.vocab_size,
                args.maxlen,
                tokenize_text=True,
                to_lower=True,
                sort_by_len=False,
                vocab_path=args.vocab_path)

    if args.pre_train_path:
        if args.valid_split == 0:
            args.valid_split = 0.2
        (pre_train_x, pre_train_y,
         pre_train_pmt), _, _, pre_overal_maxlen = dataset.get_data(
             (args.pre_train_path, args.test_path),
             args.prompt_id,
             args.vocab_size,
             args.maxlen,
             tokenize_text=True,
             to_lower=True,
             sort_by_len=False,
             vocab_path=args.vocab_path)
        overal_maxlen = max(overal_maxlen, pre_overal_maxlen)

    if args.tfidf > 0:
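        # the TF-IDF vectoriser and PCA are fitted on the training essays
        # (training_material=True) and reused unchanged for dev/test below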
        train_pca, TfIdf, Pca = dataset.get_tfidf(args.train_path,
                                                  args.prompt_id,
                                                  pca_dim=args.tfidf,
                                                  training_material=True)
        if args.valid_split == 0:
            dev_pca, _, _ = dataset.get_tfidf(args.dev_path,
                                              args.prompt_id,
                                              pca_dim=args.tfidf,
                                              tfidf=TfIdf,
                                              pca=Pca,
                                              training_material=False)
        test_pca, _, _ = dataset.get_tfidf(args.test_path,
                                           args.prompt_id,
                                           pca_dim=args.tfidf,
                                           tfidf=TfIdf,
                                           pca=Pca,
                                           training_material=False)
    else:
        dev_pca = None
        test_pca = None

    if args.features:
        train_ftr = dataset.get_features(args.train_path,
                                         args.train_feature_path,
                                         args.prompt_id)
        if args.valid_split == 0:
            valid_ftr = dataset.get_features(args.dev_path,
                                             args.dev_feature_path,
                                             args.prompt_id)
        test_ftr = dataset.get_features(args.test_path, args.test_feature_path,
                                        args.prompt_id)
    else:
        test_ftr = None

    if not args.vocab_path:
        # Dump vocab
        with open(out_dir + '/vocab.pkl', 'wb') as vocab_file:
            pk.dump(vocab, vocab_file)

    # Pad sequences for mini-batch processing
    # (legacy) if args.model_type in {'breg', 'bregp', 'clsp', 'cls', 'mlp'}:
    #     assert args.rnn_dim > 0
    #     assert args.recurrent_unit == 'lstm'
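    # Keras pad_sequences pads/truncates at the front ('pre') by default, using
    # value 0, assumed here to be the reserved padding index in the vocab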
    train_x = sequence.pad_sequences(train_x, maxlen=overal_maxlen)
    if args.valid_split == 0:
        dev_x = sequence.pad_sequences(dev_x, maxlen=overal_maxlen)
    if args.pre_train_path:
        pre_train_x = sequence.pad_sequences(pre_train_x, maxlen=overal_maxlen)
    test_x = sequence.pad_sequences(test_x, maxlen=overal_maxlen)
    # 	else:
    # 		train_x = sequence.pad_sequences(train_x)
    # 		dev_x = sequence.pad_sequences(dev_x)
    # 		test_x = sequence.pad_sequences(test_x)

    ###############################################################################################################################
    ## Some statistics
    #

    import keras.backend as K

    train_y = np.array(train_y, dtype=K.floatx())
    if args.valid_split == 0:
        dev_y = np.array(dev_y, dtype=K.floatx())
    if args.pre_train_path:
        pre_train_y = np.array(pre_train_y, dtype=K.floatx())
    test_y = np.array(test_y, dtype=K.floatx())

    if args.prompt_id >= 0:
        train_pmt = np.array(train_pmt, dtype='int32')
        if args.valid_split == 0:
            dev_pmt = np.array(dev_pmt, dtype='int32')
        test_pmt = np.array(test_pmt, dtype='int32')

    # count score distribution
    bincounts, mfs_list = U.bincounts(train_y)
    with open('%s/bincounts.txt' % out_dir, 'w') as output_file:
        for bincount in bincounts:
            output_file.write(str(bincount) + '\n')

    train_mean = train_y.mean(axis=0)
    train_std = train_y.std(axis=0)
    train_max = train_y.max(axis=0)
    train_min = train_y.min(axis=0)
    # 	dev_mean = dev_y.mean(axis=0)
    # 	dev_std = dev_y.std(axis=0)
    # 	test_mean = test_y.mean(axis=0)
    # 	test_std = test_y.std(axis=0)

    logger.info('Statistics:')

    logger.info('  train_x shape: ' + str(np.array(train_x).shape))
    if args.valid_split == 0:
        logger.info('  dev_x shape:   ' + str(np.array(dev_x).shape))
    logger.info('  test_x shape:  ' + str(np.array(test_x).shape))

    logger.info('  train_y shape: ' + str(train_y.shape))
    if args.valid_split == 0:
        logger.info('  dev_y shape:   ' + str(dev_y.shape))
    logger.info('  test_y shape:  ' + str(test_y.shape))

    logger.info(
        '  train_y max: %d, min: %d, mean: %.2f, stdev: %.3f, MFC: %s' %
        (train_max, train_min, train_mean, train_std, str(mfs_list)))
    logger.info('  train_y statistic: %s' % (str(bincounts[0]), ))

    # We need the dev and test sets in the original scale for evaluation
    # 	if args.valid_split == 0:
    # 		dev_y_org = dev_y.astype(dataset.get_ref_dtype())
    test_y_org = test_y.astype(dataset.get_ref_dtype())

    if "reg" in args.model_type:
        if args.normalize:
            logger.info('  normalize score to range (0,1)')
            # Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
            train_y = dataset.get_model_friendly_scores(train_y, train_pmt)
            if args.valid_split == 0:
                dev_y = dataset.get_model_friendly_scores(dev_y, dev_pmt)
            test_y = dataset.get_model_friendly_scores(test_y, test_pmt)
    else:
        logger.info('  convert train_y to one-hot shape')
        assert len(bincounts) == 1, "only a single score column is supported"
        categ = int(max(bincounts[0].keys())) + 1
        if args.pre_train_path:
            categ = 5
        # one-hot encode the integer scores into shape (n_samples, categ)
        train_y = to_categorical(train_y, categ)
        if args.valid_split == 0:
            dev_y = to_categorical(dev_y, categ)
        if args.pre_train_path:
            pre_train_y = to_categorical(pre_train_y, categ)
        test_y = to_categorical(test_y, categ)

    ###############################################################################################################################
    ## Optimizer algorithm
    #

    from nea.optimizers import get_optimizer
    optimizer = get_optimizer(args)

    ###############################################################################################################################
    ## Building model
    #

    if "reg" in args.model_type:
        logger.info('  use regression model')
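        # the "category" slot presumably carries the mean score here, used to
        # initialise the output layer's bias for regression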
        final_categ = train_y.mean(axis=0)
        if args.loss == 'mae':
            loss = 'mean_absolute_error'
            metric = 'mean_squared_error'
        elif args.loss == 'mse':
            loss = 'mean_squared_error'
            metric = 'mean_absolute_error'
        else:
            raise NotImplementedError
    else:
        logger.info('  use classification model')
        final_categ = categ
        if args.loss == 'cnp':
            loss = 'categorical_crossentropy'
            metric = 'categorical_accuracy'
        elif args.loss == 'hng':
            loss = 'hinge'
            metric = 'squared_hinge'
        else:
            raise NotImplementedError

    from nea.models import create_model
    model = create_model(args, final_categ, overal_maxlen, vocab)
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

    if args.onscreen: model.summary()

    ###############################################################################################################################
    ## Plotting model
    #

    from keras.utils.visualize_util import plot
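    # (old Keras API; plotting requires pydot and graphviz to be installed)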
    plot(model, to_file=out_dir + '/' + timestr + 'model_plot.png')

    ###############################################################################################################################
    ## Save model architecture
    #

    logger.info('Saving model architecture')
    with open(out_dir + '/' + timestr + 'model_config.json', 'w') as arch:
        arch.write(model.to_json(indent=2))
    logger.info('  Done')

    ###############################################################################################################################
    ## Initialize Evaluator
    #

    evl = Evaluator(args,
                    out_dir,
                    timestr,
                    metric,
                    test_x,
                    test_y,
                    test_y_org,
                    test_pmt,
                    test_pca=test_pca,
                    test_ftr=test_ftr)
    earlystop = EarlyStopping(patience=args.earlystop, verbose=1, mode='auto')
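    # Keras EarlyStopping monitors val_loss by default and stops training after
    # `patience` epochs without improvement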

    ###############################################################################################################################
    ## Training
    #

    logger.info(
        '------------------------------------------------------------------------------------------'
    )
    logger.info('Initial Evaluation:')
    evl.eval(model, -1, print_info=True)

    model_train_x = [train_x]
    if not args.valid_split:
        model_dev_x = [dev_x]
    if args.tfidf > 0:
        model_train_x.append(train_pca)
        if not args.valid_split:
            model_dev_x.append(dev_pca)
    if args.features:
        model_train_x.append(train_ftr)
        if not args.valid_split:
            model_dev_x.append(valid_ftr)
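    # the input lists must follow the model's input order: word indices first,
    # then the optional TF-IDF/PCA block, then the optional handcrafted features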

    if args.pre_train_path:
        model.fit(pre_train_x,
                  pre_train_y,
                  validation_split=0.12,
                  batch_size=args.batch_size,
                  nb_epoch=args.pre_epochs,
                  verbose=args.verbose,
                  callbacks=[earlystop, evl])

    if args.valid_split > 0:
        model.fit(model_train_x,
                  train_y,
                  validation_split=args.valid_split,
                  batch_size=args.batch_size,
                  nb_epoch=args.epochs,
                  verbose=args.verbose,
                  callbacks=[earlystop, evl])
    else:
        model.fit(model_train_x,
                  train_y,
                  validation_data=(model_dev_x, dev_y),
                  batch_size=args.batch_size,
                  nb_epoch=args.epochs,
                  verbose=args.verbose,
                  callbacks=[earlystop, evl])

    return evl.print_final_info()
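
# A hedged sketch of driving train() above: the attribute names are the ones
# the function reads in this excerpt, while the values and paths are
# placeholders. create_model and get_optimizer read further attributes
# (embedding size, RNN size, learning rate, ...) that are not shown here.
from argparse import Namespace

args = Namespace(
    out_dir_path='output',
    onscreen=True,
    model_type='reg',             # 'reg' or 'cls'
    algorithm='rmsprop',          # one of the asserted optimizer names
    loss='mse',                   # 'mse'/'mae' for reg, 'cnp'/'hng' for cls
    recurrent_unit='lstm',
    aggregation='mot',
    seed=1234,
    valid_split=0.0,              # 0 -> use the explicit dev set
    train_path='data/fold_0/train.tsv',
    dev_path='data/fold_0/dev.tsv',
    test_path='data/fold_0/test.tsv',
    prompt_id=1,
    vocab_size=4000,
    maxlen=0,
    vocab_path=None,
    pre_train_path=None,
    tfidf=0,
    features=None,
    normalize=True,
    batch_size=32,
    epochs=50,
    pre_epochs=10,
    earlystop=5,
    verbose=1,
)
# train(args)  # uncomment once real data paths and the remaining attributes are set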