예제 #1
0
def load_eval_metrics(config):
    """Build an ordered mapping of metric name -> metric object.

    Every entry of ``config['metrics']`` is lower-cased first. A name of the
    form ``"<key>@<k>"`` is resolved by looking ``<key>`` up with
    ``metrics.get`` and calling the result with ``int(<k>)``; any other name
    is resolved directly with ``metrics.get``.

    Args:
        config: Mapping whose ``'metrics'`` entry is an iterable of
            metric-name strings.

    Returns:
        An ``OrderedDict`` keyed by the lower-cased names, in config order.
    """
    resolved = OrderedDict()
    for raw_name in config['metrics']:
        name = raw_name.lower()
        if '@' not in name:
            resolved[name] = metrics.get(name)
            continue
        # Only the first '@' separates the metric key from its cutoff.
        base, _, cutoff = name.partition('@')
        resolved[name] = metrics.get(base)(int(cutoff))
    return resolved
예제 #2
0
    def similar_items(self, item, metric='euclidean', n=50):
        """Return the ``n`` highest-scoring movies relative to ``item``.

        Args:
            item: Identifier looked up in the ``movie_id`` column of the
                ratings returned by ``self.rating_service.get_all()``.
            metric: Name of the distance function to use, either
                ``'euclidean'`` or ``'pearson'``.
            n: Maximum number of results to return.

        Returns:
            A list of ``(movie_id, score)`` tuples ordered by descending
            score, as produced by ``heapq.nlargest``.

        Raises:
            KeyError: If ``item`` is not among the rated movie ids, or if
                ``metric`` does not name a callable distance function.
        """
        # Metric jump table. It must not be called ``metrics``: assigning to
        # that name would make it local to this method and the lookups of
        # ``metrics.euclidean_distance`` below would raise UnboundLocalError.
        distance_functions = {
            'euclidean': metrics.euclidean_distance,
            'pearson': metrics.pearson_correlation,
        }

        distance = distance_functions.get(metric, None)
        ratings = pd.DataFrame(list(self.rating_service.get_all()))
        movie_ids = ratings['movie_id']

        # Handle problems that might occur.
        # ``x in series`` tests the index labels, so check the values instead.
        if not movie_ids.isin([item]).any():
            raise KeyError("Unknown item, '%s'." % item)
        if not distance or not callable(distance):
            raise KeyError("Unknown or unprogrammed distance metric '%s'." %
                           metric)

        scores = {}
        for similar_item in movie_ids:
            if similar_item == item:
                continue

            # NOTE(review): ``similar_item`` and ``item`` are movie ids here,
            # so subscripting them with 'user_id' looks wrong. The intended
            # arguments of get_shared_preferences are not visible from this
            # method -- confirm against the recommender service.
            scores[similar_item] = distance(
                self.recommender_service.get_shared_preferences(
                    similar_item['user_id'], item['user_id']))

        # The original referenced an undefined ``items`` name here.
        return heapq.nlargest(n, scores.items(), key=itemgetter(1))
예제 #3
0
def train(config, para_path):
    """Train the model described by ``config`` and record the best result.

    Data paths and a few settings are taken from the JSON file at
    ``para_path`` (keys read: ``model_dst_dir``, ``weights_dir``, ``metrics``,
    ``model_path`` and ``base_metric``). After each training round every EVAL
    input is scored with the configured metrics. Whenever the ``'valid'`` tag
    improves on ``base_metric`` the weights are saved to
    ``weights_dir + global.weights_file``. The best rounded metrics are
    finally passed to ``db.insert_result``.

    Args:
        config: Nested configuration mapping with ``global``, ``inputs``,
            ``model`` and ``losses`` sections. It is modified in place.
        para_path: Path of the JSON parameter file described above.
    """
    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    optimizer = optimizers.get(global_conf['optimizer'])
    K.set_value(optimizer.lr, global_conf['learning_rate'])
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # Close the parameter file deterministically instead of leaking it.
    with open(para_path, 'r') as para_file:
        an_config = json.load(para_file)
    dstdir = an_config["model_dst_dir"]
    if not os.path.exists(an_config['weights_dir']):
        # makedirs also creates missing parent directories.
        os.makedirs(an_config['weights_dir'])
    weights_file = an_config['weights_dir'] + str(global_conf['weights_file'])
    config['metrics'] = an_config["metrics"]
    config['model']['model_path'] = an_config['model_path']

    # Point every input at the preprocessed files below ``dstdir``.
    share_input_conf['embed_path'] = dstdir + "embed_glove_d300"
    share_input_conf[
        'word_triletter_map_file'] = dstdir + "word_triletter_map.txt"
    share_input_conf['vocab_size'] = word_len(dstdir + "word_dict.txt")
    share_input_conf['text1_corpus'] = dstdir + "corpus_preprocessed.txt"
    share_input_conf['text2_corpus'] = dstdir + "corpus_preprocessed.txt"
    for split in ('train', 'valid', 'test'):
        input_conf[split]['relation_file'] = (
            dstdir + "relation_%s.txt" % split)
    for split in ('train', 'valid', 'test'):
        input_conf[split]['hist_feats_file'] = (
            dstdir + "relation_%s.binsum-20.txt" % split)

    # collect embedding
    # 'embed_path' is always set above, so the first branch is the one taken;
    # the random fallback is kept for parity with the other entry points.
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        # if no embed provided, use random
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' %
          (input_train_conf.keys(), input_eval_conf.keys()),
          end='\n')

    # collect dataset identification; each corpus file is read only once
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        for corpus_key in ('text1_corpus', 'text2_corpus'):
            if corpus_key in input_conf[tag]:
                datapath = input_conf[tag][corpus_key]
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    model = load_model(config)
    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    # "<name>@<k>" metrics are factories called with the integer cutoff k.
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)

    print('[Model] Model Compile Done.', end='\n')

    base_metric = an_config["base_metric"]
    best_epoch = 0
    best_metric = -1
    best_result = ''
    # Bound up front so the final insert_result call cannot hit a NameError
    # when the 'valid' tag never improves (or does not exist).
    best_metric_ls = {}
    # time.clock() was removed in Python 3.8; time.time() is already used for
    # the log timestamps below.
    start_time = time.time()
    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            history = model.fit_generator(genfun,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]),
                  end='\n')

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    # Consecutive list_counts entries delimit one slice of
                    # the batch; each slice is scored separately.
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()
            cur_res = '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])
            print('Iter:%d\t%s' % (i_e, cur_res), end='\n')
            cur_metric = res[base_metric] / num_valid
            # Only the 'valid' tag decides which weights are kept.
            if cur_metric > best_metric and tag == 'valid':
                best_epoch = i_e
                best_metric = cur_metric
                best_result = cur_res
                best_metric_ls = dict(
                    (k, round(v / num_valid, 4)) for k, v in res.items())
                model.save_weights(weights_file)
            sys.stdout.flush()
    end_time = time.time()
    print('the best running result %s the best epoch %d' %
          (best_result, best_epoch))
    print('the running time %s seconds' % (end_time - start_time))
    # NOTE(review): ``db``, ``task_id`` and ``model_id`` are not defined in
    # this function; they must be module-level names -- confirm.
    db.insert_result(task_id, model_id, best_metric_ls)
예제 #4
0
def predict(config):
    """Run the PREDICT inputs through a trained model and save the scores.

    The weights are loaded from ``<global.weights_file>.<test_weights_iters>``.
    For every PREDICT tag the function first prints the outputs of the
    ``att_layer_2`` and ``att_layer_3`` layers for each batch, then scores the
    batches with the configured metrics. If the tag has an entry in
    ``config['outputs']`` the ranked scores are written in ``TREC`` or
    ``TEXTNET`` format.

    Args:
        config: Nested configuration mapping with ``global``, ``inputs``,
            ``outputs``, ``metrics`` and ``net_name`` entries. The ``inputs``
            section is modified in place (the embedding matrix is attached).
    """
    ######## Read input config ########

    print(json.dumps(config, indent=2), end='\n')
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' %
          (input_predict_conf.keys()),
          end='\n')

    # collect dataset identification; each corpus file is read only once
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            for corpus_key in ('text1_corpus', 'text2_corpus'):
                if corpus_key in input_conf[tag]:
                    datapath = input_conf[tag][corpus_key]
                    if datapath not in dataset:
                        dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    predict_gen = OrderedDict()

    for tag, conf in input_predict_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(config=conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = str(global_conf['weights_file']) + '.' + str(
        global_conf['test_weights_iters'])

    model = load_model(config)
    model.load_weights(weights_file)
    # Secondary model exposing the two attention layers of the loaded model.
    encoder = Model(inputs=model.input,
                    outputs=[
                        model.get_layer('att_layer_2').output,
                        model.get_layer('att_layer_3').output
                    ])
    # "<name>@<k>" metrics are factories called with the integer cutoff k.
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)

    # Print the attention scores of the query and the sentence.
    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
              end='')
        for input_data, y_true in genfun:
            y_pred1, y_pred2 = encoder.predict(input_data,
                                               batch_size=len(y_true))
            y_pred1 = _to_list(np.squeeze(y_pred1).tolist())
            y_pred2 = _to_list(np.squeeze(y_pred2).tolist())
            print(input_data)
            print("sent", y_pred1)
            print("query", y_pred2)
            print()
        # Mirror the scoring loop below, which resets the generator after a
        # full pass, so the second pass starts from the beginning.
        # NOTE(review): assumes reset() rewinds the generator -- confirm.
        generator.reset()

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
              end='')
        # The accumulators are per tag: ``num_valid`` is reset here, so the
        # metric sums must be too, otherwise later tags would report sums
        # that include earlier tags.
        res = dict([[k, 0.] for k in eval_metrics.keys()])
        num_valid = 0
        res_scores = {}  # first ID field -> {second ID field: (score, truth)}
        for input_data, y_true in genfun:
            y_pred = model.predict(input_data, batch_size=len(y_true))

            if issubclass(type(generator),
                          inputs.list_generator.ListBasicGenerator):
                # Consecutive list_counts entries delimit one slice of the
                # batch; each slice is scored separately.
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts) - 1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx + 1]
                        res[k] += eval_func(y_true=y_true[pre:suf],
                                            y_pred=y_pred[pre:suf])

                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    for p, y, t in zip(input_data['ID'][pre:suf],
                                       y_pred[pre:suf], y_true[pre:suf]):
                        res_scores.setdefault(p[0], {})[p[1]] = (y, t)

                num_valid += len(list_counts) - 1
            else:
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    res_scores.setdefault(p[0], {})[p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        if tag in output_conf:
            save_format = output_conf[tag]['save_format']
            if save_format in ('TREC', 'TEXTNET'):
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        # Rank the candidates by descending predicted score.
                        ranked = sorted(dinfo.items(),
                                        key=lambda d: d[1][0],
                                        reverse=True)
                        for inum, (did, (score, gt)) in enumerate(ranked):
                            if save_format == 'TREC':
                                f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n' %
                                        (qid, did, inum, score,
                                         config['net_name'], gt))
                            else:
                                f.write('%s %s %s %s\n' %
                                        (gt, qid, did, score))

        print('[Predict] results: ',
              '\t'.join(['%s=%f' % (k, v / num_valid)
                         for k, v in res.items()]),
              end='\n')
        sys.stdout.flush()
예제 #5
0
# Build the network from ``model_config``.
model = ANMM(model_config).build()

# Resolve the configured losses. Names listed in ``mz_specialized_losses``
# are factories that are called with the entry's 'object_params'; all other
# names are used as returned by ``rank_losses.get``.
loss = []
for lobj in config['losses']:
    if lobj['object_name'] not in mz_specialized_losses:
        loss.append(rank_losses.get(lobj['object_name']))
        continue
    loss.append(rank_losses.get(lobj['object_name'])(lobj['object_params']))

# Resolve the configured metrics, keyed by their lower-cased name.
# "<name>@<k>" entries are factories called with the integer cutoff k.
eval_metrics = OrderedDict()
for mobj in config['metrics']:
    mobj = mobj.lower()
    if '@' not in mobj:
        eval_metrics[mobj] = metrics.get(mobj)
    else:
        mt_key, mt_val = mobj.split('@', 1)
        eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))

model.compile(optimizer=optimizer, loss=loss)
print('[Model] Model Compile Done.')


def evaluate():
    """Print a timestamped ``[Eval:<tag>]`` header for each eval generator.

    For every entry of the module-level ``eval_gen`` mapping a batch generator
    is requested and a header line (without trailing newline) is printed.

    NOTE(review): the skip counter is never incremented in this function, so
    the skip condition is never met -- confirm whether code is missing here.
    """
    skip_marker = 0
    for name, gen in eval_gen.items():
        if skip_marker != 1:
            batches = gen.get_batch_generator()
            stamp = time.strftime('%m-%d-%Y %H:%M:%S',
                                  time.localtime(time.time()))
            print('[%s]\t[Eval:%s] ' % (stamp, name), end='')
예제 #6
0
def predict(config):
    print(json.dumps(config, indent=2))

    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    assert 'embed_path' in share_input_conf
    embed_dict, vocab_size, embed_size, word_dict, idf_dict = read_embedding(
        share_input_conf['embed_path'])
    share_input_conf['word_dict'] = word_dict

    share_input_conf['vocab_size'] = vocab_size
    share_input_conf['embed_size'] = embed_size
    embed = np.float32(np.random.uniform(-9, 9, [vocab_size, embed_size]))
    embed_normalize = False
    if 'drmm' in config['model']['model_py'].lower():
        embed_normalize = True
    share_input_conf['embed'] = convert_embed_2_numpy(
        'embed', embed_dict=embed_dict, embed=embed, normalize=embed_normalize)
    idf = np.float32(np.random.uniform(4, 9, [vocab_size, 1]))
    share_input_conf['idf_feat'] = convert_embed_2_numpy('idf',
                                                         embed_dict=idf_dict,
                                                         embed=idf,
                                                         normalize=False)
    print '[%s]' % time.strftime(
        "%Y-%m-%d %H:%M:%S",
        time.localtime()), '[Embedding] Embedding Load Done.'

    # list all input tags and construct tags config
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            continue
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print '[%s]' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
    print '[Input] Process Input Tags. %s in EVAL.' % (input_eval_conf.keys())

    # initial data generator
    eval_gen = OrderedDict()

    for tag, conf in input_eval_conf.items():
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    _model = load_model(config)
    # model = multi_gpu_model(_model, gpus=2)
    model = _model

    if 'load_weights_path' in global_conf:
        model.load_weights(global_conf['load_weights_path'])
    else:
        print 'no load_weights_path'
        exit(0)

    loss = []
    for lobj in config['losses']:
        loss.append(rank_losses.get(lobj))
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print '[%s]' % time.strftime(
        "%Y-%m-%d %H:%M:%S", time.localtime()), '[Model] Model Compile Done.\n'

    print '\n### Model Info ###'
    model.summary()
    print '### Model Info ###\n'

    for i_e in range(1):
        for tag, generator in eval_gen.items():
            output_dir = config['net_name'].split('_')[0]
            output = open(
                '../output/%s/%s_%s_predict_output_%s.txt' %
                (output_dir, config['net_name'], tag, str(i_e + 1)), 'w')
            qid_uid_rel_score = {}
            qid_uid_score = {}
            genfun = generator.get_batch_generator()

            for input_data, y_true, curr_batch in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                y_pred_reshape = np.reshape(y_pred, (len(y_pred), ))
                # output the predict scores
                for (q, d, label), score in zip(curr_batch, y_pred_reshape):
                    output.write('%s\t%s\t%s\t%s\n' %
                                 (str(q), str(d), str(label), str(score)))

                    if q not in qid_uid_score:
                        qid_uid_score[q] = {}
                    qid_uid_score[q][d] = score

                    if q not in qid_uid_rel_score:
                        qid_uid_rel_score[q] = dict(label=list(), score=list())
                    qid_uid_rel_score[q]['label'].append(label)
                    qid_uid_rel_score[q]['score'].append(score)

            output.close()
            # calculate the metrices
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            for k, eval_func in eval_metrics.items():
                for qid in qid_uid_rel_score:
                    res[k] += eval_func(y_true=qid_uid_rel_score[qid]['label'],
                                        y_pred=qid_uid_rel_score[qid]['score'])
                res[k] /= len(qid_uid_rel_score)

            if 'test' in tag:
                print '[%s]' % time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime()),
                print '[Eval] @ epoch: %d,' % (i_e + 1),
                print ', '.join(['%s: %.5f' % (k, res[k]) for k in res])
            else:
                # calculate the eval_loss
                all_pairs = generator.get_all_pairs()
                all_pairs_rel_score = {}
                for qid, dp_id, dn_id in all_pairs:
                    all_pairs_rel_score[(qid, dp_id, dn_id)] = {}
                    all_pairs_rel_score[(qid, dp_id, dn_id)]['score'] = [
                        qid_uid_score[qid][dp_id], qid_uid_score[qid][dn_id]
                    ]
                    all_pairs_rel_score[(qid, dp_id,
                                         dn_id)]['rel'] = all_pairs[(qid,
                                                                     dp_id,
                                                                     dn_id)]

                eval_loss = cal_eval_loss(all_pairs_rel_score, tag,
                                          config['losses'])

                print '[%s]' % time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime()),
                print '[Eval] @ epoch: %d,' % (i_e + 1),
                print ', '.join(
                    ['%s: %.5f' % (k, eval_loss[k]) for k in eval_loss]),
                print ', '.join(['%s: %.5f' % (k, res[k]) for k in res])

        print ''
예제 #7
0
def train(config):

    print(json.dumps(config, indent=2))
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])
    is_save_weights = global_conf['is_save_weights']

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # prepare the corpus files and reference files for computing BLEU/ROUGE-L metrics
    corpus_file = share_input_conf['corpus_file']
    test_ref_list = read_refs(share_input_conf['test_ref_file'])
    valid_ref_list = read_refs(share_input_conf['valid_ref_file'])
    corpus_dict = {}
    with open(corpus_file) as fin:
        for l in fin:
            tok = l.split(' ')
            corpus_dict[tok[0]] = ' '.join(tok[2:])

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print '[Embedding] Embedding Load Done.'

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print '[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' % (
        input_train_conf.keys(), input_eval_conf.keys())

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print '[Dataset] %s Dataset Load Done.' % len(dataset)

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print conf
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print conf
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    output_conf = config['outputs']

    ######### Load Model #########
    model = load_model(config)

    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print '[Model] Model Compile Done.'

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print '[%s]\t[Train:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
            history = model.fit_generator(genfun,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)  #callbacks=[eval_map])
            print 'Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0])

        for tag, generator in eval_gen.items():
            #print('test tag: ', tag)
            genfun = generator.get_batch_generator()
            # print '[%s]\t[Eval:%s]' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
            # res = dict([[k,0.] for k in eval_metrics.keys()])
            res_scores = {
            }  # 2D dict; key qid-did ;value: predict_score, ground_truth
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    list_counts = input_data[
                        'list_counts']  # list_counts store the boundries between documents under different queries
                    y_pred = np.squeeze(y_pred)
                    for lc_idx in range(len(list_counts) - 1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx + 1]
                        for p, y, t in zip(input_data['ID'][pre:suf],
                                           y_pred[pre:suf], y_true[pre:suf]):
                            if p[0] not in res_scores:
                                res_scores[p[0]] = {}
                            res_scores[p[0]][p[1]] = (y, t)
                else:
                    NameError('not supported in this version!')
            generator.reset()
            sys.stdout.flush()
            # save predicted score files for valid/test data
            if (i_e + 1) % save_weights_iters == 0:
                score_list = []
                with open(
                        output_conf['predict']['save_path_during_train'] +
                        '-' + tag + '.' + str(i_e + 1), 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            score_l = '%s\tQ0\t%s\t%d\t%f\t%s\t%s' % (
                                qid, did, inum, score, config['net_name'], gt)
                            print >> f, score_l
                            score_list.append(score_l)
                # compute BLEU/ROUGE metrics at this check point
                ref_list = test_ref_list if tag == 'test' else valid_ref_list
                bleu_rouge_metrics = compute_bleu_rouge_given_scores_in_train(
                    score_list, corpus_dict, ref_list, tag)
                print '[%s]\t[Eval:%s] Iter:%d\t(bleu1-4 corpus_bleu rougel dist1 dist2 avglen)\t%s' \
                    % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag, i_e+1, bleu_rouge_metrics)

        if (
                i_e + 1
        ) % save_weights_iters and is_save_weights == "1":  # add an option to control saving weight files or not
            model.save_weights(weights_file % (i_e + 1))
예제 #8
0
def train(config):
    """Prepare inputs and the wrapped model, then delegate to ``train_per_epoch``.

    The function reads the ``global``, ``inputs`` and ``metrics`` sections of
    ``config``, attaches an embedding matrix and the loaded corpora to the
    per-tag input configs, builds one data generator per TRAIN / EVAL tag,
    wraps the first model returned by ``load_model`` in a ``TimeDistributed``
    layer and finally calls ``train_per_epoch``.

    Args:
        config: nested configuration mapping. ``config['inputs']['share']``
            is mutated in place (an ``'embed'`` entry is stored in it).

    Returns:
        None.
    """
    print(json.dumps(config, indent=2), end='\n')

    # Basic settings. They are parsed here (so a missing or malformed entry
    # still fails early) even though nothing below uses the parsed values.
    settings = config["global"]
    weights_pattern = str(settings['weights_file']) + '.%d'
    log_every = int(settings['display_interval'])
    total_iters = int(settings['num_iters'])
    save_every = int(settings['save_weights_iters'])

    inputs_cfg = config['inputs']
    shared_cfg = inputs_cfg['share']

    # Embedding matrix: a uniform random matrix in [-0.2, 0.2), passed through
    # convert_embed_2_numpy together with the vectors read from 'embed_path'
    # when that key is configured.
    if 'embed_path' in shared_cfg:
        pretrained = read_embedding(filename=shared_cfg['embed_path'])
        # The last vocabulary index is given an all-zero vector.
        pad_index = shared_cfg['vocab_size'] - 1
        pretrained[pad_index] = np.zeros((shared_cfg['embed_size'], ),
                                         dtype=np.float32)
        random_init = np.float32(
            np.random.uniform(
                -0.2, 0.2,
                [shared_cfg['vocab_size'], shared_cfg['embed_size']]))
        shared_cfg['embed'] = convert_embed_2_numpy(pretrained,
                                                    embed=random_init)
    else:
        shared_cfg['embed'] = np.float32(
            np.random.uniform(
                -0.2, 0.2,
                [shared_cfg['vocab_size'], shared_cfg['embed_size']]))
    print('[Embedding] Embedding Load Done.', end='\n')

    # Per-tag configs: shared settings first, tag-specific settings on top.
    # Entries without a 'phase' key (and phases other than TRAIN / EVAL) are
    # ignored.
    train_confs = OrderedDict()
    eval_confs = OrderedDict()
    for tag in inputs_cfg.keys():
        tag_cfg = inputs_cfg[tag]
        if 'phase' not in tag_cfg:
            continue
        phase = tag_cfg['phase']
        if phase == 'TRAIN':
            bucket = train_confs
        elif phase == 'EVAL':
            bucket = eval_confs
        else:
            continue
        merged = {}
        merged.update(shared_cfg)
        merged.update(tag_cfg)
        bucket[tag] = merged
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' %
          (train_confs.keys(), eval_confs.keys()),
          end='\n')

    # Load every referenced corpus file exactly once, keyed by its path.
    # PREDICT-phase tags are skipped; the 'share' entry is always inspected.
    dataset = {}
    for tag in inputs_cfg:
        tag_cfg = inputs_cfg[tag]
        if tag != 'share' and tag_cfg['phase'] == 'PREDICT':
            continue
        for corpus_key in ('text1_corpus', 'text2_corpus'):
            if corpus_key not in tag_cfg:
                continue
            corpus_path = tag_cfg[corpus_key]
            if corpus_path not in dataset:
                dataset[corpus_path], _ = read_data(corpus_path)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # One generator per tag; TRAIN tags are processed before EVAL tags.
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    for confs, generators in ((train_confs, train_gen),
                              (eval_confs, eval_gen)):
        for tag, conf in confs.items():
            print(conf, end='\n')
            conf['data1'] = dataset[conf['text1_corpus']]
            conf['data2'] = dataset[conf['text2_corpus']]
            make_generator = inputs.get(conf['input_type'])
            generators[tag] = make_generator(config=conf)

    ######### Load Model #########
    # load_model returns two models here; only the first one is used.
    zmodel, _second_model = load_model(config)

    pair_input = Input(name='input', shape=(2, 50))
    distributed = TimeDistributed(layer=zmodel,
                                  input_shape=(2, 50))(pair_input)
    z_knrm_model = Model(input=pair_input, output=distributed)

    # Metric names are lower-cased; "name@k" entries call the looked-up
    # metric with int(k), plain names store the looked-up object directly.
    eval_metrics = OrderedDict()
    for raw_name in config['metrics']:
        metric_name = raw_name.lower()
        base, at_sign, cutoff = metric_name.partition('@')
        if at_sign:
            eval_metrics[metric_name] = metrics.get(base)(int(cutoff))
        else:
            eval_metrics[metric_name] = metrics.get(metric_name)

    z_knrm_model.set_tensorboard("/tmp/matchzoo", "knrm-sgd-1e4")

    # NOTE(review): the keyword is spelled 'leaningrate_schedule' exactly as
    # in the original call; confirm against the SGD signature before renaming.
    optim_method = SGD(1e-4, leaningrate_schedule=Poly(0.5, 50 * 400))
    train_per_epoch(z_knrm_model,
                    zmodel,
                    train_gen,
                    eval_gen,
                    eval_metrics,
                    optimMethod=optim_method)
예제 #9
0
파일: main.py 프로젝트: hhh920406/MatchZoo
def train(config):
    """Train the configured model, evaluating after every iteration.

    Reads the ``global``, ``inputs``, ``losses`` and ``metrics`` sections of
    ``config``, builds one data generator per TRAIN / EVAL input tag,
    compiles the model returned by ``load_model`` and then, for ``num_iters``
    iterations, fits on every TRAIN generator, prints averaged metrics for
    every EVAL generator and periodically saves the model weights.

    Args:
        config: nested configuration mapping. ``config['inputs']['share']``
            is mutated in place (an ``'embed'`` entry is stored in it).

    Returns:
        None.
    """
    print(json.dumps(config, indent=2), end='\n')

    # Global settings: optimizer (learning rate overridden from the config),
    # weight-file name pattern and the logging / checkpoint cadence.
    settings = config["global"]
    optimizer = optimizers.get(settings['optimizer'])
    K.set_value(optimizer.lr, settings['learning_rate'])
    weights_file = str(settings['weights_file']) + '.%d'
    display_interval = int(settings['display_interval'])
    num_iters = int(settings['num_iters'])
    save_weights_iters = int(settings['save_weights_iters'])

    inputs_cfg = config['inputs']
    shared_cfg = inputs_cfg['share']

    # Embedding matrix: uniform random values in [-0.2, 0.2), passed through
    # convert_embed_2_numpy with the vectors from 'embed_path' when present.
    if 'embed_path' in shared_cfg:
        pretrained = read_embedding(filename=shared_cfg['embed_path'])
        # The last vocabulary index is given an all-zero vector.
        pad_index = shared_cfg['vocab_size'] - 1
        pretrained[pad_index] = np.zeros((shared_cfg['embed_size'], ),
                                         dtype=np.float32)
        random_init = np.float32(
            np.random.uniform(
                -0.2, 0.2,
                [shared_cfg['vocab_size'], shared_cfg['embed_size']]))
        shared_cfg['embed'] = convert_embed_2_numpy(pretrained,
                                                    embed=random_init)
    else:
        shared_cfg['embed'] = np.float32(
            np.random.uniform(
                -0.2, 0.2,
                [shared_cfg['vocab_size'], shared_cfg['embed_size']]))
    print('[Embedding] Embedding Load Done.', end='\n')

    # Per-tag configs: shared settings first, tag-specific ones on top.
    train_confs = OrderedDict()
    eval_confs = OrderedDict()
    for tag in inputs_cfg.keys():
        tag_cfg = inputs_cfg[tag]
        if 'phase' not in tag_cfg:
            continue
        phase = tag_cfg['phase']
        if phase == 'TRAIN':
            bucket = train_confs
        elif phase == 'EVAL':
            bucket = eval_confs
        else:
            continue
        merged = {}
        merged.update(shared_cfg)
        merged.update(tag_cfg)
        bucket[tag] = merged
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.'
          % (train_confs.keys(), eval_confs.keys()), end='\n')

    # Load each referenced corpus once, keyed by path; PREDICT tags skipped.
    dataset = {}
    for tag in inputs_cfg:
        tag_cfg = inputs_cfg[tag]
        if tag != 'share' and tag_cfg['phase'] == 'PREDICT':
            continue
        for corpus_key in ('text1_corpus', 'text2_corpus'):
            if corpus_key not in tag_cfg:
                continue
            corpus_path = tag_cfg[corpus_key]
            if corpus_path not in dataset:
                dataset[corpus_path], _ = read_data(corpus_path)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # One generator per tag; TRAIN tags are processed before EVAL tags.
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    for confs, generators in ((train_confs, train_gen),
                              (eval_confs, eval_gen)):
        for tag, conf in confs.items():
            print(conf, end='\n')
            conf['data1'] = dataset[conf['text1_corpus']]
            conf['data2'] = dataset[conf['text2_corpus']]
            make_generator = inputs.get(conf['input_type'])
            generators[tag] = make_generator(config=conf)

    ######### Load Model #########
    model = load_model(config)

    # Losses listed in mz_specialized_losses are called with their
    # 'object_params'; all others are used as returned by rank_losses.get.
    loss = []
    for spec in config['losses']:
        loss_name = spec['object_name']
        needs_params = loss_name in mz_specialized_losses
        loss_obj = rank_losses.get(loss_name)
        if needs_params:
            loss_obj = loss_obj(spec['object_params'])
        loss.append(loss_obj)

    # "name@k" metrics call the looked-up metric with int(k).
    eval_metrics = OrderedDict()
    for raw_name in config['metrics']:
        metric_name = raw_name.lower()
        base, at_sign, cutoff = metric_name.partition('@')
        if at_sign:
            eval_metrics[metric_name] = metrics.get(base)(int(cutoff))
        else:
            eval_metrics[metric_name] = metrics.get(metric_name)

    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')

    for i_e in range(num_iters):
        # ---- fit: display_interval steps on every TRAIN generator ----
        for tag, generator in train_gen.items():
            batches = generator.get_batch_generator()
            stamp = time.strftime('%m-%d-%Y %H:%M:%S',
                                  time.localtime(time.time()))
            print('[%s]\t[Train:%s] ' % (stamp, tag), end='')
            history = model.fit_generator(batches,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]),
                  end='\n')

        # ---- evaluate: average every metric over each EVAL generator ----
        for tag, generator in eval_gen.items():
            batches = generator.get_batch_generator()
            stamp = time.strftime('%m-%d-%Y %H:%M:%S',
                                  time.localtime(time.time()))
            print('[%s]\t[Eval:%s] ' % (stamp, tag), end='')
            res = {name: 0. for name in eval_metrics.keys()}
            num_valid = 0
            for input_data, y_true in batches:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if not issubclass(type(generator),
                                  inputs.list_generator.ListBasicGenerator):
                    # Non-list generators: one metric value per batch.
                    for name, metric_fn in eval_metrics.items():
                        res[name] += metric_fn(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
                    continue
                # List generators: consecutive 'list_counts' entries delimit
                # the slices the metrics are evaluated on.
                list_counts = input_data['list_counts']
                for name, metric_fn in eval_metrics.items():
                    for idx in range(1, len(list_counts)):
                        pre = list_counts[idx - 1]
                        suf = list_counts[idx]
                        res[name] += metric_fn(y_true=y_true[pre:suf],
                                               y_pred=y_pred[pre:suf])
                num_valid += len(list_counts) - 1
            generator.reset()
            report = '\t'.join(['%s=%f' % (name, total / num_valid)
                                for name, total in res.items()])
            print('Iter:%d\t%s' % (i_e, report), end='\n')
            sys.stdout.flush()

        # ---- checkpoint every save_weights_iters iterations ----
        finished = i_e + 1
        if finished % save_weights_iters == 0:
            model.save_weights(weights_file % finished)
예제 #10
0
파일: main.py 프로젝트: hhh920406/MatchZoo
def predict(config):
    """Score every PREDICT-phase input with saved weights and report metrics.

    Reads the ``inputs``, ``outputs``, ``global`` and ``metrics`` sections of
    ``config``, builds one data generator per PREDICT tag, loads the model
    weights from ``<weights_file>.<test_weights_iters>`` and, for each tag,
    runs the model over all batches, optionally writes the ranked scores to
    the file configured under ``config['outputs'][tag]`` (``TREC`` or
    ``TEXTNET`` format) and prints the averaged metrics.

    Args:
        config: nested configuration mapping. ``config['inputs']['share']``
            is mutated in place (an ``'embed'`` entry is stored in it).

    Returns:
        None.
    """
    ######## Read input config ########

    print(json.dumps(config, indent=2), end='\n')
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # The last vocabulary index is given an all-zero vector.
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ), dtype=np.float32)
        embed = np.float32(np.random.uniform(-0.02, 0.02, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict, embed = embed)
    else:
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    # (shared settings first, tag-specific settings on top)
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' % (input_predict_conf.keys()), end='\n')

    # collect dataset identification: each corpus path is loaded only once
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    predict_gen = OrderedDict()

    for tag, conf in input_predict_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(config = conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = str(global_conf['weights_file']) + '.' + str(global_conf['test_weights_iters'])

    model = load_model(config)
    model.load_weights(weights_file)

    # "name@k" metrics call the looked-up metric with int(k); plain names
    # store whatever metrics.get returns.
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
        # Metric sums are reset for every tag. They used to be created once
        # before this loop while num_valid was reset per tag, so the averages
        # printed for any tag after the first also contained the sums of all
        # previously processed tags.
        res = {k: 0. for k in eval_metrics}
        num_valid = 0
        res_scores = {}  # first ID element -> {second ID element: (score, label)}
        for input_data, y_true in genfun:
            y_pred = model.predict(input_data, batch_size=len(y_true) )

            if issubclass(type(generator), inputs.list_generator.ListBasicGenerator):
                # Consecutive 'list_counts' entries delimit the slices that
                # the metrics are evaluated on.
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts)-1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx+1]
                        res[k] += eval_func(y_true = y_true[pre:suf], y_pred = y_pred[pre:suf])

                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts)-1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx+1]
                    for p, y, t in zip(input_data['ID'][pre:suf], y_pred[pre:suf], y_true[pre:suf]):
                        if p[0] not in res_scores:
                            res_scores[p[0]] = {}
                        res_scores[p[0]][p[1]] = (y, t)

                num_valid += len(list_counts) - 1
            else:
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true = y_true, y_pred = y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    # Element 1 of each prediction / label row is recorded.
                    res_scores[p[0]][p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        if tag in output_conf:
            # Within each group, entries are written by descending score.
            if output_conf[tag]['save_format'] == 'TREC':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(), key=lambda d:d[1][0], reverse=True)
                        for inum,(did, (score, gt)) in enumerate(dinfo):
                            f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n'%(qid, did, inum, score, config['net_name'], gt))
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(), key=lambda d:d[1][0], reverse=True)
                        for inum,(did, (score, gt)) in enumerate(dinfo):
                            f.write('%s %s %s %s\n'%(gt, qid, did, score))

        print('[Predict] results: ', '\t'.join(['%s=%f'%(k,v/num_valid) for k, v in res.items()]), end='\n')
        sys.stdout.flush()
예제 #11
0
파일: app.py 프로젝트: florianjacob/try-bro
def bro_metrics_json():
    """Return ``cors_jsonify`` applied to the current metrics.

    The mapping returned by ``metrics.get()`` is unpacked into keyword
    arguments for ``cors_jsonify``.
    """
    current = metrics.get()
    response = cors_jsonify(**current)
    return response
예제 #12
0
파일: app.py 프로젝트: florianjacob/try-bro
def bro_metrics():
    """Render ``metrics.html`` with the value of ``metrics.get()``.

    The value is passed to the template under the name ``metrics``.
    """
    current = metrics.get()
    page = render_template('metrics.html', metrics=current)
    return page
예제 #13
0
파일: main.py 프로젝트: snowcement/MatchZoo
def train(config):
    """Train the configured model, evaluating after every iteration.

    Reads the ``global``, ``inputs``, ``losses`` and ``metrics`` sections of
    ``config``, builds one data generator per TRAIN / EVAL input tag,
    compiles the model returned by ``load_model`` and then, for ``num_iters``
    iterations, fits on every TRAIN generator, prints averaged metrics for
    every EVAL generator and periodically saves the model weights.

    Args:
        config: nested configuration mapping. ``config['inputs']['share']``
            is mutated in place (an ``'embed'`` entry is stored in it).

    Returns:
        None.
    """
    # json.dumps() turns the dict into a str so that it can be printed
    # (or written out) as JSON text.
    print(json.dumps(config, indent=2), end='\n')

    # read basic config
    settings = config["global"]
    # NOTE(review): the original author notes this always resolves to
    # keras.optimizers; confirm against the file's imports.
    optimizer = optimizers.get(settings['optimizer'])
    # Overwrite the optimizer's learning-rate variable with the config value.
    K.set_value(optimizer.lr, settings['learning_rate'])
    weights_file = str(settings['weights_file']) + '.%d'
    display_interval = int(settings['display_interval'])
    num_iters = int(settings['num_iters'])
    save_weights_iters = int(settings['save_weights_iters'])

    # read input config
    inputs_cfg = config['inputs']
    shared_cfg = inputs_cfg['share']

    # collect embedding: uniform random values in [-0.2, 0.2), passed through
    # convert_embed_2_numpy with the vectors from 'embed_path' when present
    if 'embed_path' in shared_cfg:
        pretrained = read_embedding(filename=shared_cfg['embed_path'])
        # The last vocabulary index is given an all-zero vector.
        pad_index = shared_cfg['vocab_size'] - 1
        pretrained[pad_index] = np.zeros((shared_cfg['embed_size'], ),
                                         dtype=np.float32)
        random_init = np.float32(
            np.random.uniform(
                -0.2, 0.2,
                [shared_cfg['vocab_size'], shared_cfg['embed_size']]))
        shared_cfg['embed'] = convert_embed_2_numpy(pretrained,
                                                    embed=random_init)
    else:
        shared_cfg['embed'] = np.float32(
            np.random.uniform(
                -0.2, 0.2,
                [shared_cfg['vocab_size'], shared_cfg['embed_size']]))
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    # (OrderedDict keeps the tags in insertion order)
    train_confs = OrderedDict()
    eval_confs = OrderedDict()
    for tag in inputs_cfg.keys():
        tag_cfg = inputs_cfg[tag]
        if 'phase' not in tag_cfg:
            continue
        phase = tag_cfg['phase']
        if phase == 'TRAIN':
            bucket = train_confs
        elif phase == 'EVAL':
            bucket = eval_confs
        else:
            continue
        merged = {}
        merged.update(shared_cfg)
        merged.update(tag_cfg)
        bucket[tag] = merged
    # Example output recorded by the original author:
    # odict_keys(['train']) in TRAIN, odict_keys(['valid', 'test']) in EVAL.
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' %
          (train_confs.keys(), eval_confs.keys()),
          end='\n')

    # collect dataset identification: each corpus path is loaded only once,
    # PREDICT-phase tags are skipped
    dataset = {}
    for tag in inputs_cfg:
        tag_cfg = inputs_cfg[tag]
        if tag != 'share' and tag_cfg['phase'] == 'PREDICT':
            continue
        for corpus_key in ('text1_corpus', 'text2_corpus'):
            if corpus_key not in tag_cfg:
                continue
            corpus_path = tag_cfg[corpus_key]
            if corpus_path not in dataset:
                dataset[corpus_path], _ = read_data(corpus_path)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator (TRAIN tags first, then EVAL tags)
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    for confs, generators in ((train_confs, train_gen),
                              (eval_confs, eval_gen)):
        for tag, conf in confs.items():
            print(conf, end='\n')
            conf['data1'] = dataset[conf['text1_corpus']]
            conf['data2'] = dataset[conf['text2_corpus']]
            make_generator = inputs.get(conf['input_type'])
            generators[tag] = make_generator(config=conf)

    ######### Load Model #########
    model = load_model(config)

    # Losses listed in mz_specialized_losses are called with their
    # 'object_params'; all others are used as returned by rank_losses.get.
    loss = []
    for spec in config['losses']:
        loss_name = spec['object_name']
        needs_params = loss_name in mz_specialized_losses
        loss_obj = rank_losses.get(loss_name)
        if needs_params:
            loss_obj = loss_obj(spec['object_params'])
        loss.append(loss_obj)

    # "name@k" metrics call the looked-up metric with int(k).
    eval_metrics = OrderedDict()
    for raw_name in config['metrics']:
        metric_name = raw_name.lower()
        base, at_sign, cutoff = metric_name.partition('@')
        if at_sign:
            eval_metrics[metric_name] = metrics.get(base)(int(cutoff))
        else:
            eval_metrics[metric_name] = metrics.get(metric_name)

    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')

    # num_iters plays a role similar to the number of epochs.
    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            # NOTE(review): per the original author's notes, each batch holds
            # batch_size*2 samples (half positive, half negative) and
            # display_interval = len(pair_list)//(batch_size*2); neither can
            # be verified from this function.
            batches = generator.get_batch_generator()
            stamp = time.strftime('%m-%d-%Y %H:%M:%S',
                                  time.localtime(time.time()))
            print('[%s]\t[Train:%s] ' % (stamp, tag), end='')
            history = model.fit_generator(batches,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]),
                  end='\n')

        for tag, generator in eval_gen.items():
            batches = generator.get_batch_generator()
            stamp = time.strftime('%m-%d-%Y %H:%M:%S',
                                  time.localtime(time.time()))
            print('[%s]\t[Eval:%s] ' % (stamp, tag), end='')
            res = {name: 0. for name in eval_metrics.keys()}
            num_valid = 0
            for input_data, y_true in batches:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if not issubclass(type(generator),
                                  inputs.list_generator.ListBasicGenerator):
                    # Non-list generators: one metric value per batch.
                    for name, metric_fn in eval_metrics.items():
                        res[name] += metric_fn(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
                    continue
                # List generators: consecutive 'list_counts' entries delimit
                # the slices the metrics are evaluated on.
                list_counts = input_data['list_counts']
                for name, metric_fn in eval_metrics.items():
                    for idx in range(1, len(list_counts)):
                        pre = list_counts[idx - 1]
                        suf = list_counts[idx]
                        res[name] += metric_fn(y_true=y_true[pre:suf],
                                               y_pred=y_pred[pre:suf])
                num_valid += len(list_counts) - 1
            generator.reset()
            report = '\t'.join(['%s=%f' % (name, total / num_valid)
                                for name, total in res.items()])
            print('Iter:%d\t%s' % (i_e, report), end='\n')
            sys.stdout.flush()

        # Save a checkpoint every save_weights_iters iterations.
        finished = i_e + 1
        if finished % save_weights_iters == 0:
            model.save_weights(weights_file % finished)
예제 #14
0
def predict(config):
    ######## Read input config ########

    print(json.dumps(config, indent=2))
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['fill_word']
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print '[Embedding] Embedding Load Done.'

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print '[Input] Process Input Tags. %s in PREDICT.' % (
        input_predict_conf.keys())

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print '[Dataset] %s Dataset Load Done.' % len(dataset)

    # initial data generator
    predict_gen = OrderedDict()

    for tag, conf in input_predict_conf.items():
        print conf
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
            #data1 = dataset[conf['text1_corpus']],
            #data2 = dataset[conf['text2_corpus']],
            config=conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = global_conf['weights_file']

    model = load_model(config)
    model.load_weights(weights_file)

    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    res = dict([[k, 0.] for k in eval_metrics.keys()])

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print '[Predict] @ %s ' % tag,
        num_valid = 0
        res_scores = {}
        for input_data, y_true in genfun:
            list_counts = input_data['list_counts']
            y_pred = model.predict(input_data, batch_size=len(y_true))

            for k, eval_func in eval_metrics.items():
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    res[k] += eval_func(y_true=y_true[pre:suf],
                                        y_pred=y_pred[pre:suf])

            y_pred = np.squeeze(y_pred)
            for lc_idx in range(len(list_counts) - 1):
                pre = list_counts[lc_idx]
                suf = list_counts[lc_idx + 1]
                for p, y, t in zip(input_data['ID'][pre:suf], y_pred[pre:suf],
                                   y_true[pre:suf]):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    res_scores[p[0]][p[1]] = (y, t)

            num_valid += len(list_counts) - 1
        generator.reset()

        if tag in output_conf:
            if output_conf[tag]['save_format'] == 'TREC':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            print >> f, '%s\tQ0\t%s\t%d\t%f\t%s' % (
                                qid, did, inum, score, config['net_name'])
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            print >> f, '%s %s %s %s' % (gt, qid, did, score)

        print '[Predict] results: ', '  '.join(
            ['%s:%f' % (k, v / num_valid) for k, v in res.items()])
        sys.stdout.flush()
예제 #15
0
    def __init__(self,
                 model_path,
                 batch_size,
                 epoch_num,
                 lr,
                 keep_rate,
                 seq_len=None,
                 net_params=None,
                 origin_file=None,
                 files_split=None,
                 match_model="bert",
                 vec_models=None,
                 engine=None,
                 recall_num=5,
                 eval_metrics=None):
        '''
        Store the configuration, build the network and try to restore a
        previously saved checkpoint from the model directory.

        :param model_path: path where the model is stored; formatted with
            ``m=match_model`` and created as a directory if it is missing
        :param batch_size: stored as ``self.batch_size``
        :param epoch_num: stored as ``self.epoch_num``
        :param lr: stored as ``self.lr``
        :param keep_rate: stored as ``self.keep_rate``
        :param seq_len: stored as ``self.seq_len``
        :param net_params: stored as ``self.net_params``
        :param origin_file: raw text-category file / phrasing-question file
        :param files_split: preprocessed training and test sets
        :param match_model: ranking / matching model; must be a key of
            ``nets_dict``
        :param vec_models: feature extractors (non-empty list, pluggable);
            nested lists are flattened one level
        :param engine: retrieval engine
        :param recall_num: stored as ``self.per_docs_num``
        :param eval_metrics: evaluation metrics (list, pluggable); entries of
            the form ``name@k`` call the looked-up metric with ``int(k)``
        :raises AssertionError: if ``vec_models`` is not a non-empty list, or
            ``eval_metrics`` is truthy but not a non-empty list
        :raises ValueError: if ``match_model`` is not a key of ``nets_dict``
        '''
        self.r_state = 666
        assert isinstance(vec_models, list) and len(vec_models) > 0
        if match_model not in nets_dict:
            raise ValueError("valid model must in: {}".format(" ".join(
                [k for k in nets_dict.keys()])))
        self.model_name = match_model
        self.model_path = model_path.format(m=match_model)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        self.max_models_num = 7
        self.net_params = net_params
        self.ignore_std_queries = ["others"]

        # Flatten one level of nesting in the featurizer list.
        self.featurizers = []
        for m in vec_models:
            if isinstance(m, list):
                self.featurizers += m
            else:
                self.featurizers.append(m)
        self.engine = engine

        # NOTE(review): self.eval_metrics is only defined when eval_metrics
        # is truthy; confirm that later readers handle the missing attribute.
        if eval_metrics:
            assert isinstance(eval_metrics, list) and len(eval_metrics) > 0
            self.eval_metrics = OrderedDict()
            for mobj in eval_metrics:
                mobj = mobj.lower()
                if '@' in mobj:
                    mt_key, mt_val = mobj.split('@', 1)
                    self.eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
                else:
                    self.eval_metrics[mobj] = metrics.get(mobj)

        self.origin_file, self.files_split = origin_file, files_split
        self.sep = "\t"
        self.query_col, self.label_col = "question", "qid"
        self.train_rate = 0.7  # train / test data split
        self.key_cols = [self.label_col, self.query_col]

        self.per_docs_num = recall_num
        self.input_cols = ["text_{}".format(i) for i in range(2)]
        self.num_feed_x = len(self.featurizers) * len(self.input_cols)
        self.model_col = "label"
        self.sample_dist = 1.0  # label distribution of the training set
        self.cols = self.input_cols + [self.model_col]

        self.query2qid, self.train_num, self.eva_num = None, None, None

        self.seq_len = seq_len
        self.num_class = 2
        self.keep_rate = keep_rate
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.lr = lr
        # One dtype per (featurizer, input column) pair: float32 when the
        # featurizer's output_shape has more than two entries, int32
        # otherwise; a trailing int32 entry is appended after the loop.
        self.tf_dtypes, self.np_dtypes = [], []
        for m in self.featurizers:
            self.tf_dtypes += [
                tf.float32 if len(m.output_shape) > 2 else tf.int32
            ] * len(self.input_cols)
            self.np_dtypes += [
                np.float32 if len(m.output_shape) > 2 else np.int32
            ] * len(self.input_cols)
        self.tf_dtypes.append(tf.int32)
        self.np_dtypes.append(np.int32)
        self.input_tensors = []
        self.test_y, self.keep_prob = None, None
        self.define_tensor()
        self.net_loss, self.one_hot_labels, self.pred_prob = None, None, None
        self.net_init()

        self.session = None
        model_save = tf.train.get_checkpoint_state(self.model_path)
        if model_save and model_save.model_checkpoint_path:
            print("Loading matching model...")
            # tf.reset_default_graph()
            self.session = tf.Session()
            self.session.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            try:
                saver.restore(self.session, model_save.model_checkpoint_path)
                print("Rank model is ready")
            except Exception as err:
                # Restoring stays best-effort, but a bare ``except:`` would
                # also swallow KeyboardInterrupt / SystemExit and hide the
                # reason for the failure.
                print("Load rank model Failed !!")
                print(repr(err))
        else:
            print("Rank model not exists")
예제 #16
0
def predict(config):
    """Run every ``PREDICT``-phase input through its model and report metrics.

    Steps, in order: build the shared embedding matrix, read every corpus
    referenced by the ``share`` entry or by a ``PREDICT`` tag, create one
    batch generator per ``PREDICT`` tag, load the model(s) selected by
    ``config['net_name']`` together with their weights, then score each tag
    with every metric named in ``config['metrics']``.  Tags that have an
    entry in ``config['outputs']`` also get their per-item scores written to
    ``save_path + '-' + str(seed)`` in ``TREC`` or ``TEXTNET`` format.

    Args:
        config: Nested dict providing ``inputs`` (including a ``share``
            entry), ``outputs``, ``global``, ``net_name`` and ``metrics``.
            ``config['inputs']['share']`` is mutated: the embedding matrix
            is stored under its ``embed`` key.

    Returns:
        None.  Progress and averaged metrics are printed to stdout.

    Note:
        ``seed`` is not a parameter; it is looked up in the enclosing module
        scope and appended to every weights path and output path.
    """
    ######## Read input config ########

    print(json.dumps(config, indent=2))
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    vocab_size = share_input_conf['vocab_size']
    embed_size = share_input_conf['embed_size']
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # Index vocab_size - 1 gets an all-zero vector (presumably the
        # padding token -- confirm against the vocabulary builder).
        embed_dict[vocab_size - 1] = np.zeros((embed_size, ), dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [vocab_size, embed_size]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        # No embedding file configured: purely random matrix.  Note that the
        # range here (+-0.2) is wider than in the branch above (+-0.02).
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [vocab_size, embed_size]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag, tag_conf in input_conf.items():
        if tag_conf.get('phase') != 'PREDICT':
            continue
        # Shared settings first, so tag-specific values override them.
        merged_conf = {}
        merged_conf.update(share_input_conf)
        merged_conf.update(tag_conf)
        input_predict_conf[tag] = merged_conf
    print('[Input] Process Input Tags. %s in PREDICT.' %
          (input_predict_conf.keys(), ))

    # collect dataset identification
    dataset = {}
    for tag, tag_conf in input_conf.items():
        # .get() keeps entries without a 'phase' key from raising KeyError,
        # matching the tag loop above which already tolerates them.
        if tag != 'share' and tag_conf.get('phase') != 'PREDICT':
            continue
        for corpus_key in ('text1_corpus', 'text2_corpus'):
            if corpus_key in tag_conf:
                datapath = tag_conf[corpus_key]
                if datapath not in dataset:
                    dataset[datapath] = read_data_2d(datapath)
        # qa_comat_file for qa_cooccur_matrix in DMN_KD_CQA and DMN_KD_Web
        if 'qa_comat_file' in tag_conf:
            datapath = tag_conf['qa_comat_file']
            if datapath not in dataset:
                dataset[datapath] = read_qa_comat(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generator
    predict_gen = OrderedDict()
    for tag, conf in input_predict_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        if 'qa_comat_file' in share_input_conf:
            conf['qa_comat'] = dataset[conf['qa_comat_file']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(config=conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    net_name = config['net_name']
    weights_file = (str(global_conf['weights_file']) + '.' +
                    str(global_conf['test_weights_iters']) + '-' + str(seed))

    # Which of model / model_clf / model_web exist depends on the network.
    if net_name == 'DMN_CNN_MTL':
        model, model_clf = load_model(config)
        model.load_weights(weights_file)
    elif net_name == 'DMN_CNN_INTENTS':
        model_clf = load_model(config)
        model_clf.load_weights(weights_file)
    elif net_name == 'DMN_CNN_MTL_Web' or net_name == 'DMN_CNN_MTL_Web_v2':
        model, model_web = load_model(config)
        model.load_weights(weights_file)
        weights_file_web = (str(global_conf['weights_file_web']) + '.' +
                            str(global_conf['test_weights_iters']) + '-' +
                            str(seed))
        model_web.load_weights(weights_file_web)
    elif net_name == 'DMN_CNN_MTL_All':
        model, model_web, model_clf = load_model(config)
        model.load_weights(weights_file)
        weights_file_web = (str(global_conf['weights_file_web']) + '.' +
                            str(global_conf['test_weights_iters']) + '-' +
                            str(seed))
        model_web.load_weights(weights_file_web)
    else:
        model = load_model(config)
        model.load_weights(weights_file)

    # Metric names of the form "name@k" are factories called with int(k).
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        # Text, one extra space, no newline: the results line printed at the
        # end of this iteration continues on the same output line.
        sys.stdout.write('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag) + ' ')

        # Metric sums restart for every tag; otherwise a later tag's average
        # would include the sums of the tags processed before it.
        res = {k: 0. for k in eval_metrics}
        num_valid = 0
        res_scores = {}

        # NOTE(review): a tag outside these three keeps whatever model the
        # previous iteration selected (or fails on the first iteration).
        if tag == 'predict':
            model_to_evaluate = model
        elif tag == 'predict_clf':
            model_to_evaluate = model_clf
        elif tag == 'predict_web':
            model_to_evaluate = model_web

        for input_data, y_true in genfun:
            y_pred = model_to_evaluate.predict(input_data,
                                               batch_size=len(y_true))

            if tag == 'predict_clf':
                y_pred = np.argmax(y_pred, axis=1)

            if issubclass(type(generator),
                          inputs.list_generator.ListBasicGenerator):
                # Consecutive entries of list_counts delimit one list each.
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts) - 1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx + 1]
                        res[k] += eval_func(y_true=y_true[pre:suf],
                                            y_pred=y_pred[pre:suf])

                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    for p, y, t in zip(input_data['ID'][pre:suf],
                                       y_pred[pre:suf], y_true[pre:suf]):
                        if tag == 'predict_clf':
                            res_scores[p[0]] = (y, t)
                        else:
                            res_scores.setdefault(p[0], {})[p[1]] = (y, t)

                num_valid += len(list_counts) - 1
            else:
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    res_scores.setdefault(p[0], {})[p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        if tag in output_conf:
            save_path = output_conf[tag]['save_path'] + '-' + str(seed)
            save_format = output_conf[tag]['save_format']

            if save_format == 'TREC':
                with open(save_path, 'w') as f:
                    if tag == 'predict_clf':
                        for qid, entry in res_scores.items():
                            f.write('%s\t%d\t%d\n' %
                                    (qid, entry[0], entry[1]))
                    else:
                        for qid, dinfo in res_scores.items():
                            # Highest score first; inum is the 0-based rank.
                            ranked = sorted(dinfo.items(),
                                            key=lambda d: d[1][0],
                                            reverse=True)
                            for inum, (did, (score, gt)) in enumerate(ranked):
                                f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n' %
                                        (qid, did, inum, score, net_name, gt))
            elif save_format == 'TEXTNET':
                with open(save_path, 'w') as f:
                    for qid, dinfo in res_scores.items():
                        ranked = sorted(dinfo.items(),
                                        key=lambda d: d[1][0],
                                        reverse=True)
                        for inum, (did, (score, gt)) in enumerate(ranked):
                            f.write('%s %s %s %s\n' % (gt, qid, did, score))

        # A generator that produced nothing reports nan instead of dividing
        # by zero.
        print('[Predict] results:  ' + '\t'.join([
            '%s=%f' % (k, v / num_valid if num_valid else float('nan'))
            for k, v in res.items()
        ]))
        sys.stdout.flush()
예제 #17
0
def train(config):
    """Train the model described by ``config`` and evaluate it every round.

    The function builds the shared embedding matrix, reads every corpus that
    is not exclusive to a ``PREDICT`` tag, creates one batch generator per
    ``TRAIN`` and per ``EVAL`` tag, compiles the model with the configured
    optimizer and losses, and then runs ``num_iters`` rounds.  Each round
    fits every ``TRAIN`` generator for ``display_interval`` steps, evaluates
    every ``EVAL`` generator with the metrics in ``config['metrics']``, and
    every ``save_weights_iters`` rounds saves the weights to
    ``<weights_file>.<round>``.

    Args:
        config: Nested dict providing ``global``, ``inputs`` (including a
            ``share`` entry), ``losses`` and ``metrics``.
            ``config['inputs']['share']`` is mutated: the embedding matrix
            is stored under its ``embed`` key.

    Returns:
        None.  Progress, losses and averaged metrics are printed to stdout.
    """
    print(json.dumps(config, indent=2))
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    vocab_size = share_input_conf['vocab_size']
    embed_size = share_input_conf['embed_size']
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # Index vocab_size - 1 gets an all-zero vector (presumably the
        # padding token -- confirm against the vocabulary builder).
        embed_dict[vocab_size - 1] = np.zeros((embed_size, ), dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [vocab_size, embed_size]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [vocab_size, embed_size]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    phase_targets = {'TRAIN': input_train_conf, 'EVAL': input_eval_conf}
    for tag, tag_conf in input_conf.items():
        if 'phase' not in tag_conf:
            continue
        target = phase_targets.get(tag_conf['phase'])
        if target is None:
            continue
        # Shared settings first, so tag-specific values override them.
        merged_conf = {}
        merged_conf.update(share_input_conf)
        merged_conf.update(tag_conf)
        target[tag] = merged_conf
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' %
          (input_train_conf.keys(), input_eval_conf.keys()))

    # collect dataset identification
    dataset = {}
    for tag, tag_conf in input_conf.items():
        # .get() keeps entries without a 'phase' key from raising KeyError,
        # matching the tag loop above which already tolerates them.
        if tag != 'share' and tag_conf.get('phase') == 'PREDICT':
            continue
        for corpus_key in ('text1_corpus', 'text2_corpus'):
            if corpus_key in tag_conf:
                datapath = tag_conf[corpus_key]
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    model = load_model(config)

    # Losses listed in mz_specialized_losses are factories that take the
    # configured parameters; the others are used as returned by the registry.
    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))

    # Metric names of the form "name@k" are factories called with int(k).
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.')

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            # Text, one space, no newline: the loss line printed after
            # fitting continues on the same output line.
            sys.stdout.write('[%s]\t[Train:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag) + ' ')
            history = model.fit_generator(genfun,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]))

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            sys.stdout.write('[%s]\t[Eval:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag) + ' ')
            res = {k: 0. for k in eval_metrics}
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    # Consecutive entries of list_counts delimit one list.
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()
            # A generator that produced nothing reports nan instead of
            # aborting the training run with a ZeroDivisionError.
            print('Iter:%d\t%s' % (i_e, '\t'.join([
                '%s=%f' % (k, v / num_valid if num_valid else float('nan'))
                for k, v in res.items()
            ])))
            sys.stdout.flush()
        if (i_e + 1) % save_weights_iters == 0:
            model.save_weights(weights_file % (i_e + 1))
예제 #18
0
def train(config):
    """Train the network(s) selected by ``config['net_name']``.

    Depending on the network name, ``load_model`` returns one, two or three
    models (ranking ``model``, ``model_web``, ``model_clf``).  The function
    builds the shared embedding matrix, reads every corpus that is not
    exclusive to a ``PREDICT`` tag, creates one batch generator per ``TRAIN``
    and ``EVAL`` tag, optionally restores existing weights, compiles the
    models and runs ``num_iters`` rounds.  Each round fits every ``TRAIN``
    generator for ``display_interval`` steps with the model chosen by the
    tag name.  Every ``save_weights_iters`` rounds the ``EVAL`` generators
    are scored and weights are saved to ``<weights_file>.<round>-<seed>``
    (and ``<weights_file_web>.<round>-<seed>`` for the web networks).

    Args:
        config: Nested dict providing ``global``, ``inputs`` (including a
            ``share`` entry), ``net_name``, ``metrics`` and, for every
            network except ``DMN_CNN_INTENTS``, ``losses``.
            ``config['inputs']['share']`` is mutated: the embedding matrix
            is stored under its ``embed`` key.

    Returns:
        None.  Progress, losses and averaged metrics are printed to stdout.

    Raises:
        Exception: If the module-level ``seed`` is ``None``.
    """
    if seed is None:
        raise Exception('Seed should be set')
    print('Using seed: ' + str(seed))
    # read basic config
    global_conf = config["global"]
    learning_rate = global_conf['learning_rate']
    use_existing_weights = global_conf.get('use_existing_weights')
    optimizer = Adam(lr=learning_rate)
    # Both names below are '%d' templates filled with the round number when
    # weights are saved; they must not be overwritten with concrete paths.
    weights_file = str(global_conf['weights_file']) + '.%d'
    weights_file_web = None
    if 'weights_file_web' in global_conf:
        weights_file_web = str(global_conf['weights_file_web']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    vocab_size = share_input_conf['vocab_size']
    embed_size = share_input_conf['embed_size']
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # Index vocab_size - 1 gets an all-zero vector (presumably the
        # padding token -- confirm against the vocabulary builder).
        embed_dict[vocab_size - 1] = np.zeros((embed_size, ), dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [vocab_size, embed_size]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [vocab_size, embed_size]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    phase_targets = {'TRAIN': input_train_conf, 'EVAL': input_eval_conf}
    for tag, tag_conf in input_conf.items():
        if 'phase' not in tag_conf:
            continue
        target = phase_targets.get(tag_conf['phase'])
        if target is None:
            continue
        # Shared settings first, so tag-specific values override them.
        merged_conf = {}
        merged_conf.update(share_input_conf)
        merged_conf.update(tag_conf)
        target[tag] = merged_conf

    # collect dataset identification
    dataset = {}
    for tag, tag_conf in input_conf.items():
        # .get() keeps entries without a 'phase' key from raising KeyError,
        # matching the tag loop above which already tolerates them.
        if tag != 'share' and tag_conf.get('phase') == 'PREDICT':
            continue
        for corpus_key in ('text1_corpus', 'text2_corpus'):
            if corpus_key in tag_conf:
                datapath = tag_conf[corpus_key]
                if datapath not in dataset:
                    dataset[datapath] = read_data_2d(datapath)
        # qa_comat_file for qa_cooccur_matrix in DMN_KD
        if 'qa_comat_file' in tag_conf:
            datapath = tag_conf['qa_comat_file']
            if datapath not in dataset:
                dataset[datapath] = read_qa_comat(datapath)

    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for phase_conf, phase_gen in ((input_train_conf, train_gen),
                                  (input_eval_conf, eval_gen)):
        for tag, conf in phase_conf.items():
            print(conf)
            conf['data1'] = dataset[conf['text1_corpus']]
            conf['data2'] = dataset[conf['text2_corpus']]
            if 'qa_comat_file' in share_input_conf:
                conf['qa_comat'] = dataset[conf['qa_comat_file']]
            generator = inputs.get(conf['input_type'])
            phase_gen[tag] = generator(config=conf)

    ######### Load Model #########

    net_name = config['net_name']
    web_nets = ('DMN_CNN_MTL_Web', 'DMN_CNN_MTL_Web_v2')
    saves_web_model = net_name in web_nets or net_name == 'DMN_CNN_MTL_All'

    if net_name == 'DMN_CNN_MTL':
        model, model_clf = load_model(config)
        if use_existing_weights:
            weights_file_to_load = (str(global_conf['weights_file']) + '.' +
                                    str(global_conf['weights_to_load']) +
                                    '-' + str(seed))
            model.load_weights(weights_file_to_load)

        model_clf.compile(optimizer=optimizer, loss=custom_loss)
        print('[Model] MTL models Compile Done.')
    elif net_name == 'DMN_CNN_INTENTS':
        model_clf = load_model(config)
        model_clf.compile(optimizer=optimizer, loss=custom_loss)
        print('[Model] Intent Only classifier model Compile Done.')
    elif net_name in web_nets:
        model, model_web = load_model(config)
        if use_existing_weights:
            weights_file_to_load = (str(global_conf['weights_file']) + '.' +
                                    str(global_conf['weights_to_load']) +
                                    '-' + str(seed))
            model.load_weights(weights_file_to_load)
            # Kept in its own variable: reusing weights_file_web here would
            # replace the '%d' save template with a concrete path and make
            # the later "weights_file_web % round" raise TypeError.
            weights_file_web_to_load = (
                str(global_conf['weights_file_web']) + '.' +
                str(global_conf['weights_to_load']) + '-' + str(seed))
            model_web.load_weights(weights_file_web_to_load)
    elif net_name == 'DMN_CNN_MTL_All':
        model, model_web, model_clf = load_model(config)
        model_clf.compile(optimizer=optimizer, loss=custom_loss)
    else:
        model = load_model(config)
        if use_existing_weights:
            # NOTE(review): this branch builds the path from
            # 'test_weights_iters' while the branches above use
            # 'weights_to_load' -- confirm which key is intended.
            weights_file_to_load = (str(global_conf['weights_file']) + '.' +
                                    str(global_conf['test_weights_iters']) +
                                    '-' + str(seed))
            model.load_weights(weights_file_to_load)

        print('[Model] Response Ranking model Compile Done.')

    # Metric names of the form "name@k" are factories called with int(k).
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)

    if net_name != 'DMN_CNN_INTENTS':
        # Losses listed in mz_specialized_losses are factories that take the
        # configured parameters; the others are used as returned.
        loss = []
        for lobj in config['losses']:
            if lobj['object_name'] in mz_specialized_losses:
                loss.append(
                    rank_losses.get(lobj['object_name'])(lobj['object_params']))
            else:
                loss.append(rank_losses.get(lobj['object_name']))

        model.compile(optimizer=optimizer, loss=loss)
        print('[Model] Model Compile Done.')

        if saves_web_model:
            model_web.compile(optimizer=optimizer, loss=loss)
            print('[Model Web] Model Compile Done')

    # The flag is compared as the string 'False', not as a boolean.
    if share_input_conf['predict'] == 'False':
        for skipped_tag in ('test', 'valid', 'eval_predict_in'):
            eval_gen.pop(skipped_tag, None)

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            # Text, one space, no newline: the loss line printed after
            # fitting continues on the same output line.
            sys.stdout.write('[%s]\t[Train:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag) + ' ')

            # NOTE(review): a tag outside these three keeps whichever model
            # was selected last (or fails if none was selected yet).
            if tag == "train_clf":
                correct_model = model_clf
            elif tag == 'train_web':
                correct_model = model_web
            elif tag == "train":
                correct_model = model

            history = correct_model.fit_generator(
                genfun,
                # if display_interval = 10, there are 10 batches in 1 epoch
                steps_per_epoch=display_interval,
                epochs=1,
                shuffle=False,
                verbose=0)

            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]))

        checkpoint_round = (i_e + 1) % save_weights_iters == 0

        if checkpoint_round:
            for tag, generator in eval_gen.items():
                print('Evaluating tag:' + str(tag))
                genfun = generator.get_batch_generator()
                sys.stdout.write('[%s]\t[Eval:%s]' % (time.strftime(
                    '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag) +
                                 ' ')
                res = {k: 0. for k in eval_metrics}
                num_valid = 0

                # NOTE(review): tags other than these three (e.g. 'test')
                # are scored with the most recently selected model,
                # including one left over from the training loop above.
                if tag == "valid":
                    correct_model = model
                elif tag == "valid_web":
                    correct_model = model_web
                elif tag == "valid_clf":
                    correct_model = model_clf

                for input_data, y_true in genfun:
                    y_pred = correct_model.predict(input_data,
                                                   batch_size=len(y_true))
                    if issubclass(type(generator),
                                  inputs.list_generator.ListBasicGenerator):
                        # Consecutive list_counts entries delimit one list.
                        list_counts = input_data['list_counts']
                        for k, eval_func in eval_metrics.items():
                            for lc_idx in range(len(list_counts) - 1):
                                pre = list_counts[lc_idx]
                                suf = list_counts[lc_idx + 1]
                                res[k] += eval_func(y_true=y_true[pre:suf],
                                                    y_pred=y_pred[pre:suf])
                        num_valid += len(list_counts) - 1
                    else:
                        for k, eval_func in eval_metrics.items():
                            res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                        num_valid += 1
                generator.reset()
                # A generator that produced nothing reports nan instead of
                # aborting the run with a ZeroDivisionError.
                print('Iter:%d\t%s' % (i_e, '\t'.join([
                    '%s=%f' %
                    (k, v / num_valid if num_valid else float('nan'))
                    for k, v in res.items()
                ])))
                sys.stdout.flush()

        sys.stdout.flush()

        if checkpoint_round:
            weights_file_name = (weights_file % (i_e + 1)) + '-' + str(seed)
            if saves_web_model:
                weights_file_name_web = (weights_file_web %
                                         (i_e + 1)) + '-' + str(seed)
                model.save_weights(weights_file_name)
                model_web.save_weights(weights_file_name_web)
            elif net_name != 'DMN_CNN_INTENTS':
                model.save_weights(weights_file_name)
            else:
                model_clf.save_weights(weights_file_name)
예제 #19
0
def predict(config):
    """Predict with the zoo model while cross-checking it against Keras.

    ``load_model`` returns a pair ``(zmodel, kmodel)``.  For every batch of
    every ``PREDICT`` tag both models are run, whether their outputs agree
    (``np.allclose`` with ``rtol=atol=1e-5``) is printed, and the zoo model's
    output is scored with the metrics in ``config['metrics']``.  Tags that
    have an entry in ``config['outputs']`` also get their per-item scores
    written to ``save_path`` in ``TREC`` or ``TEXTNET`` format.

    Args:
        config: Nested dict providing ``inputs`` (including a ``share``
            entry), ``outputs`` and ``metrics``; ``net_name`` is read when
            writing ``TREC`` output.
            ``config['inputs']['share']`` is mutated: the embedding matrix
            is stored under its ``embed`` key.

    Returns:
        None.  Progress and averaged metrics are printed to stdout.
    """
    ######## Read input config ########

    print(json.dumps(config, indent=2))
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    vocab_size = share_input_conf['vocab_size']
    embed_size = share_input_conf['embed_size']
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # Index vocab_size - 1 gets an all-zero vector (presumably the
        # padding token -- confirm against the vocabulary builder).
        embed_dict[vocab_size - 1] = np.zeros((embed_size, ), dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [vocab_size, embed_size]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        # No embedding file configured: purely random matrix.  Note that the
        # range here (+-0.2) is wider than in the branch above (+-0.02).
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [vocab_size, embed_size]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag, tag_conf in input_conf.items():
        if tag_conf.get('phase') != 'PREDICT':
            continue
        # Shared settings first, so tag-specific values override them.
        merged_conf = {}
        merged_conf.update(share_input_conf)
        merged_conf.update(tag_conf)
        input_predict_conf[tag] = merged_conf
    print('[Input] Process Input Tags. %s in PREDICT.' %
          (input_predict_conf.keys(), ))

    # collect dataset identification
    dataset = {}
    for tag, tag_conf in input_conf.items():
        # .get() keeps entries without a 'phase' key from raising KeyError,
        # matching the tag loop above which already tolerates them.
        if tag != 'share' and tag_conf.get('phase') != 'PREDICT':
            continue
        for corpus_key in ('text1_corpus', 'text2_corpus'):
            if corpus_key in tag_conf:
                datapath = tag_conf[corpus_key]
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generator
    predict_gen = OrderedDict()
    for tag, conf in input_predict_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(config=conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    # NOTE(review): no checkpoint is loaded here -- both models are used
    # exactly as load_model returns them.  Confirm that this is intended.
    zmodel, kmodel = load_model(config)

    # Metric names of the form "name@k" are factories called with int(k).
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)

    # NOTE(review): input names and shapes are hard-coded rather than taken
    # from config -- confirm they match the model being checked.
    input_names = ['query', 'doc']
    input_shapes = [(None, 10), (None, 40)]

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
              end='')

        # Metric sums restart for every tag; otherwise a later tag's average
        # would include the sums of the tags processed before it.
        res = {k: 0. for k in eval_metrics}
        num_valid = 0
        res_scores = {}
        for input_data, y_true in genfun:
            ky_pred = kmodel.predict(input_data, batch_size=len(y_true))
            list_input_data = _standardize_input_data(input_data,
                                                      input_names,
                                                      input_shapes,
                                                      check_batch_axis=False)
            y_pred = zmodel.forward(list_input_data)
            # Per-batch agreement check between the two implementations.
            equal = np.allclose(y_pred, ky_pred, rtol=1e-5, atol=1e-5)
            print(equal)

            if issubclass(type(generator),
                          inputs.list_generator.ListBasicGenerator):
                # Consecutive entries of list_counts delimit one list each.
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts) - 1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx + 1]
                        res[k] += eval_func(y_true=y_true[pre:suf],
                                            y_pred=y_pred[pre:suf])

                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    for p, y, t in zip(input_data['ID'][pre:suf],
                                       y_pred[pre:suf], y_true[pre:suf]):
                        res_scores.setdefault(p[0], {})[p[1]] = (y, t)

                num_valid += len(list_counts) - 1
            else:
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    res_scores.setdefault(p[0], {})[p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        if tag in output_conf:
            save_format = output_conf[tag]['save_format']
            if save_format == 'TREC':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        # Highest score first; inum is the 0-based rank.
                        ranked = sorted(dinfo.items(),
                                        key=lambda d: d[1][0],
                                        reverse=True)
                        for inum, (did, (score, gt)) in enumerate(ranked):
                            f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n' %
                                    (qid, did, inum, score, config['net_name'],
                                     gt))
            elif save_format == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        ranked = sorted(dinfo.items(),
                                        key=lambda d: d[1][0],
                                        reverse=True)
                        for inum, (did, (score, gt)) in enumerate(ranked):
                            f.write('%s %s %s %s\n' % (gt, qid, did, score))

        # A generator that produced nothing reports nan instead of dividing
        # by zero.
        print('[Predict] results: ',
              '\t'.join([
                  '%s=%f' % (k, v / num_valid if num_valid else float('nan'))
                  for k, v in res.items()
              ]))
        sys.stdout.flush()
예제 #20
0
def train(config):
    """Train the model described by ``config`` and record per-iteration stats.

    The function reads these top-level sections of ``config``:

    * ``config['global']`` -- ``optimizer``, ``learning_rate``,
      ``weights_file``, ``logs``, ``display_interval``, ``num_iters`` and
      ``save_weights_iters``.
    * ``config['inputs']`` -- a ``share`` section (``vocab_size``,
      ``embed_size`` and optionally ``embed_path``) plus one section per
      input tag. Tags whose ``phase`` is ``TRAIN``, ``EVAL`` or
      ``EVAL_LOSS`` are turned into data generators; tags without a
      ``phase`` are ignored.
    * ``config['losses']`` and ``config['metrics']`` -- the losses passed to
      ``model.compile`` and the metrics evaluated after every iteration.

    Side effects: sets the module-level ``logs_dir`` (creating the directory
    if needed), stores the embedding matrix in
    ``config['inputs']['share']['embed']``, fills the module-level
    ``stats_for_plots`` mapping, writes weight files every
    ``save_weights_iters`` iterations and finally calls ``export_loss`` and
    ``export_metrics``.

    NOTE(review): an ``EVAL_LOSS`` input tag literally named ``'test_loss'``
    is required; it supplies the validation data for every fit call.
    """
    global logs_dir

    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    optimizer = optimizers.get(global_conf['optimizer'])
    K.set_value(optimizer.lr, global_conf['learning_rate'])
    weights_file = str(global_conf['weights_file']) + '.%d'

    logs_dir = str(global_conf['logs'])
    if not os.path.exists(logs_dir):
        os.makedirs(logs_dir)

    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding: start from a random matrix and, when a pre-trained
    # embedding file is configured, overlay its vectors on top of it.
    vocab_size = share_input_conf['vocab_size']
    embed_size = share_input_conf['embed_size']
    embed_dict = None
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # The last vocabulary index is used as padding and gets a zero vector.
        embed_dict[vocab_size - 1] = np.zeros((embed_size, ),
                                              dtype=np.float32)
    embed = np.float32(
        np.random.uniform(-0.2, 0.2, [vocab_size, embed_size]))
    if embed_dict is not None:
        embed = convert_embed_2_numpy(embed_dict, embed=embed)
    share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    input_eval_loss_conf = OrderedDict()
    # phase -> (destination mapping, suffix appended to the stats key)
    phase_targets = {
        'TRAIN': (input_train_conf, '_loss'),
        'EVAL': (input_eval_conf, ''),
        'EVAL_LOSS': (input_eval_loss_conf, ''),
    }
    for tag in input_conf.keys():
        tag_conf = input_conf[tag]
        if 'phase' not in tag_conf:
            continue
        target = phase_targets.get(tag_conf['phase'])
        if target is None:
            continue
        destination, stats_suffix = target
        # Tag-specific settings override the shared ones.
        merged_conf = {}
        merged_conf.update(share_input_conf)
        merged_conf.update(tag_conf)
        destination[tag] = merged_conf
        stats_for_plots[tag + stats_suffix] = dict()
    print(
        '[Input] Process Input Tags. %s in TRAIN, %s in EVAL, %s in EVAL_LOSS.'
        % (input_train_conf.keys(), input_eval_conf.keys(),
           input_eval_loss_conf.keys()),
        end='\n')

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        tag_conf = input_conf[tag]
        # Use .get so a tag without 'phase' is tolerated here exactly as it
        # is in the tag-classification loop above.
        if tag != 'share' and tag_conf.get('phase') == 'PREDICT':
            continue
        for corpus_key in ('text1_corpus', 'text2_corpus'):
            if corpus_key not in tag_conf:
                continue
            datapath = tag_conf[corpus_key]
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    def _build_generators(tag_confs):
        """Attach corpus data to each conf and instantiate its generator."""
        built = OrderedDict()
        for gen_tag, conf in tag_confs.items():
            print(conf, end='\n')
            conf['data1'] = dataset[conf['text1_corpus']]
            conf['data2'] = dataset[conf['text2_corpus']]
            generator_cls = inputs.get(conf['input_type'])
            built[gen_tag] = generator_cls(config=conf)
        return built

    # initial data generator
    train_gen = _build_generators(input_train_conf)
    eval_gen = _build_generators(input_eval_conf)
    eval_loss_gen = _build_generators(input_eval_loss_conf)

    ######### Load Model #########
    model = load_model(config)

    loss = []
    # Name of the last configured loss; the loss values read from the fit
    # history are filed under this key in stats_for_plots.
    loss_stat_key = None
    for lobj in config['losses']:
        loss_name = lobj['object_name']
        if loss_name in mz_specialized_losses:
            loss.append(rank_losses.get(loss_name)(lobj['object_params']))
        else:
            loss.append(rank_losses.get(loss_name))
        for stat_key in stats_for_plots:
            if 'loss' in stat_key:
                stats_for_plots[stat_key][loss_name] = []
        loss_stat_key = loss_name

    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            # "name@k" metrics are factories parameterised by the integer k.
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
        for stat_key in stats_for_plots:
            if 'loss' not in stat_key:
                stats_for_plots[stat_key][mobj] = []
    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            evalfun = eval_loss_gen['test_loss'].get_batch_generator()
            print('*' * 100)

            history = model.fit_generator(
                genfun,
                steps_per_epoch=display_interval,
                epochs=1,
                shuffle=False,
                verbose=0,
                validation_data=evalfun,
                validation_steps=display_interval,
                callbacks=[
                    TrainValTensorBoard(log_dir=os.path.join(
                        logs_dir, 'tensorboard'),
                                        global_step=display_interval * i_e,
                                        write_graph=False)
                ])
            for stat_key in stats_for_plots:
                if 'loss' not in stat_key:
                    continue
                print('[%s]\t[Train:%s] ' % (time.strftime(
                    '%m-%d-%Y %H:%M:%S', time.localtime(time.time())),
                                             stat_key),
                      end='')
                # Keys mentioning test/val(id) track the validation loss;
                # everything else tracks the training loss.
                if 'test' in stat_key or 'val' in stat_key:
                    _l = history.history['val_loss'][0]
                else:
                    _l = history.history['loss'][0]
                stats_for_plots[stat_key][loss_stat_key].append(_l)
                print('Iter:%d\t Loss =%.6f' % (i_e, _l), end='\n')
            print('-' * 50)

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            res = {k: 0. for k in eval_metrics}
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if isinstance(generator,
                              inputs.list_generator.ListBasicGenerator):
                    # list_counts holds the boundaries of each list inside
                    # the batch; metrics are computed per list.
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()

            # An eval generator that yields nothing must not abort training
            # with a ZeroDivisionError; its metrics are reported as 0.
            denom = max(num_valid, 1)
            for k, v in res.items():
                stats_for_plots[tag][k].append(v / denom)

            print('Iter:%d\t%s' % (i_e, '\t'.join(
                ['%s=%f' % (k, v / denom) for k, v in res.items()])),
                  end='\n')
            sys.stdout.flush()
        if (i_e + 1) % save_weights_iters == 0:
            model.save_weights(weights_file % (i_e + 1))
    export_loss(False, 1)
    export_metrics(False, 1)