Example #1
def generate_data(dataset, input_conf):
    gen = OrderedDict()
    for tag, conf in input_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        gen[tag] = generator(config=conf)
    return gen
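
The OrderedDict returned above maps each input tag to a MatchZoo data generator; evaluation-style generators are later consumed batch by batch (see Examples #7 and #10). A minimal usage sketch, assuming a compiled Keras model named model (an assumption, not part of this example):

gen = generate_data(dataset, input_conf)
for tag, generator in gen.items():
    genfun = generator.get_batch_generator()
    for input_data, y_true in genfun:
        y_pred = model.predict(input_data, batch_size=len(y_true))  # score one batch
    generator.reset()  # rewind before the next pass over this tag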
Example #2
def updateWatches():
    inputsList = inputs.get()

    namesToWatch = [a['name'] for a in config['applications'] if a['role'] == 'master']
    namesToDuck = [a['name'] for a in config['applications'] if a['role'] == 'slave']

    inputsToWatch = [i for i in inputsList if i.name in namesToWatch]
    inputsToDuck = [i for i in inputsList if i.name in namesToDuck]
    inputToDuck = inputsToDuck[0]
    inputsToWatchNew = [i for i in inputsToWatch if i.index not in watchingIndices]


    for inputToWatch in inputsToWatchNew:
        print(f"Started watching {inputToWatch}")    
        t = threading.Thread(target=reader.monitor, args=[inputToWatch, inputToDuck.index, supervisorInstance])    # added inputToDuck to duck first and only slave. 
        t.start()
        watchingIndices.append(inputToWatch.index)
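
Note that inputsToDuck[0] raises an IndexError when no application has the 'slave' role; a defensive variant of that line (a sketch, not part of the original) would bail out first:

    if not inputsToDuck:
        print("No 'slave' application configured; nothing to duck.")
        return
    inputToDuck = inputsToDuck[0]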
Example #3
def get_all(argumentos):
    algorit, number = get_from_args(argumentos)
    inp = inputs.get(algorit, number)
    img = imgs.get(algorit, number)
    show = shows.get(algorit, number)  # number is unused here, but it does no harm
    return inp, img, show
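
A hypothetical call site, assuming get_from_args() pulls the algorithm name and an image number out of the command-line arguments:

import sys

inp, img, show = get_all(sys.argv[1:])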
Example #4
def predict(config):
    print(json.dumps(config, indent=2), end='\n')
    input_conf = config['inputs']
    share_input_conf = input_conf['share']
    # collect embedding
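    # If a pre-trained embedding file is provided, the PAD index (vocab_size - 1)
    # gets a zero vector and the randomly initialised matrix below is overwritten
    # with the loaded vectors via convert_embed_2_numpy; otherwise the random
    # matrix is used as-is.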
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' %
          (input_predict_conf.keys()),
          end='\n')
    # collect dataset identification

    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = str(global_conf['weights_file']) + '.' + str(
        global_conf['test_weights_iters'])

    model = load_model(config)
    model.load_weights(weights_file)
    sock = socket.socket(
        socket.AF_INET,  # Internet
        socket.SOCK_STREAM)  # TCP
    sock.bind((TCP_IP, TCP_PORT))

    print("Program is now ready for predictions")

    while True:
        sock.listen(1)

        conn, addr = sock.accept()
        numberOfMessages = conn.recv(64)
        numberOfMessages = int(numberOfMessages)
        conn.send("\n".encode(ENCODING))
        list_list_data = []
        for i in range(0, numberOfMessages):
            data = conn.recv(50000)
            data_string = str(data.decode(ENCODING))
            list_data = data_string.split("\n")
            for d in list_data:
                d_stripped = d.split(" ")
                if (len(d_stripped) > 2):
                    list_list_data.append(
                        (d_stripped[0], d_stripped[1], d_stripped[2]))
            conn.send("\n".encode(ENCODING))
        qData = conn.recv(1000)
        qData_string = str(qData.decode(ENCODING))
        list_qData = qData_string.split("\n")
        list_list_qData = {}
        dataset = {}
        for d in list_qData:
            line = d.strip().split()
            tid = line[0]
            list_list_qData[tid] = list(map(int, line[2:]))
        dataset['querydata'] = list_list_qData
        conn.send("\n".encode(ENCODING))
        sizeData = conn.recv(50)
        sizeData_int = int(sizeData.decode(ENCODING))
        list_dData = []
        for i in range(0, sizeData_int):
            conn.send("\n".encode(ENCODING))
            dData = conn.recv(50000)
            dData_string = str(dData.decode(ENCODING))
            #print(dData_string)
            list_dData.append(dData_string)
        list_list_dData = {}
        for d in list_dData:
            line = d.strip().split()
            tid = line[0]
            list_list_dData[tid] = list(map(int, line[2:]))

        dataset['documentdata'] = list_list_dData
        predict_gen = OrderedDict()
        for tag, conf in input_predict_conf.items():
            conf['data2'] = dataset['documentdata']
            conf['data1'] = dataset['querydata']
            generator = inputs.get(conf['input_type'])
            predict_gen[tag] = generator(
                #data1 = dataset[conf['text1_corpus']],
                #data2 = dataset[conf['text2_corpus']],
                config=conf,
                rel_data=list_list_data)
        dataset = {}

        for tag, generator in predict_gen.items():
            genfun = generator.get_batch_generator()
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                print("Sending message")

                message = " ".join(map(str, y_pred.tolist()))
                message = message + '\n'
                #print(sys.getsizeof(message))
                #print(message)
                #sendSock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 4112)  # Buffer size 8192
                conn.send(message.encode(ENCODING))
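
For reference, a minimal client sketch that follows the handshake this server expects (each chunk is acknowledged with a single '\n'); TCP_IP, TCP_PORT and ENCODING are assumed to match the server's constants, and all payload lines are placeholders:

import socket

TCP_IP, TCP_PORT, ENCODING = "127.0.0.1", 5005, "utf-8"  # placeholders; must match the server

def send_and_wait(sock, payload):
    sock.send(payload.encode(ENCODING))
    sock.recv(16)  # the server answers each chunk with '\n'

client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
client.connect((TCP_IP, TCP_PORT))

rel_chunks = ["q1 d1 0\nq1 d2 1"]            # placeholder "qid did label" triples
send_and_wait(client, str(len(rel_chunks)))   # number of relation chunks
for chunk in rel_chunks:
    send_and_wait(client, chunk)

send_and_wait(client, "q1 2 4 8")             # placeholder query line: id, length, term ids

doc_chunks = ["d1 3 5 6 7", "d2 2 9 10"]      # placeholder document lines
client.send(str(len(doc_chunks)).encode(ENCODING))
for chunk in doc_chunks:
    client.recv(16)                           # server sends '\n' before each document chunk
    client.send(chunk.encode(ENCODING))

print(client.recv(50000).decode(ENCODING))    # first batch of space-separated prediction scores
client.close()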
Example #5
    plt.ylabel('Velocity (m/s)')
    plt.xlabel('Time (seconds)')
    plt.axis([0, timeList[-1], 0, (velocityList[-1] + 10)])
    plt.plot(timeList, velocityList)

    #places arrow on graph to show terminal velocity and the time
    arrowString = 'Terminal:\n' + str(velocityList[-1]) + ' m/s\n' + str(
        timeList[-1]) + ' seconds'
    plt.annotate((arrowString),
                 xy=(timeList[-1], velocityList[-1]),
                 xytext=(timeList[-1] - 10, velocityList[-1] - 20),
                 arrowprops=dict(facecolor='black', shrink=0.05))

    #displays the graph
    plt.show()


if __name__ == "__main__":

    #gathers all necessary variables to calculate model
    timeInterval, acceleration, initialVelocity, dragCoefficient, mass, area = inputs.get()

    #passes those variables onto calculate()
    timeList, accelerationList, velocityList, distanceList = calculate(
        timeInterval, acceleration, initialVelocity, dragCoefficient, mass,
        area)

    #gets lists and passes them to be rendered
    renderTable(timeList, accelerationList, velocityList, distanceList)
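
calculate() itself is not shown in this example; a minimal sketch of what it might look like, assuming a forward-Euler integration with quadratic drag (the 0.5 * airDensity factor and the stopping criterion are assumptions, not part of the original):

def calculate(timeInterval, acceleration, initialVelocity, dragCoefficient, mass, area,
              airDensity=1.225):
    timeList, accelerationList = [0.0], [acceleration]
    velocityList, distanceList = [initialVelocity], [0.0]
    velocity, distance, t = initialVelocity, 0.0, 0.0
    while True:
        # net acceleration = gravity minus quadratic air drag
        drag = 0.5 * airDensity * dragCoefficient * area * velocity ** 2 / mass
        net = acceleration - drag
        velocity += net * timeInterval
        distance += velocity * timeInterval
        t += timeInterval
        timeList.append(t)
        accelerationList.append(net)
        velocityList.append(velocity)
        distanceList.append(distance)
        if abs(net) < 1e-3:  # effectively at terminal velocity
            return timeList, accelerationList, velocityList, distanceList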
Example #6
            dataset[datapath], _ = read_data(datapath)
    if 'text2_corpus' in input_conf[tag]:
        datapath = input_conf[tag]['text2_corpus']
        if datapath not in dataset:
            dataset[datapath], _ = read_data(datapath)
print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

# initial data generator
train_gen = OrderedDict()
eval_gen = OrderedDict()

for tag, conf in input_train_conf.items():
    print(conf, end='\n')
    conf['data1'] = dataset[conf['text1_corpus']]
    conf['data2'] = dataset[conf['text2_corpus']]
    generator = inputs.get(conf['input_type'])
    train_gen[tag] = generator(config=conf)

for tag, conf in input_eval_conf.items():
    print(conf, end='\n')
    conf['data1'] = dataset[conf['text1_corpus']]
    conf['data2'] = dataset[conf['text2_corpus']]
    generator = inputs.get(conf['input_type'])
    eval_gen[tag] = generator(config=conf)

import models
sys.path.append('/home/xingyuchen/jupyter/matchzoo/models/')
from matchpyramid import *
from anmm import *
model_config = config['model']['setting']
model_config.update(config['inputs']['share'])
Example #7
def train(config):

    print(json.dumps(config, indent=2))
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])
    is_save_weights = global_conf['is_save_weights']

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # prepare the corpus files and reference files for computing BLEU/ROUGE-L metrics
    corpus_file = share_input_conf['corpus_file']
    test_ref_list = read_refs(share_input_conf['test_ref_file'])
    valid_ref_list = read_refs(share_input_conf['valid_ref_file'])
    corpus_dict = {}
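    # Each corpus line is assumed to look like "<id> <count> tok tok ...": tok[0] is
    # the id, tok[1] (presumably a token count) is skipped, and the rest is the text.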
    with open(corpus_file) as fin:
        for l in fin:
            tok = l.split(' ')
            corpus_dict[tok[0]] = ' '.join(tok[2:])

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print '[Embedding] Embedding Load Done.'

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print '[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' % (
        input_train_conf.keys(), input_eval_conf.keys())

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print '[Dataset] %s Dataset Load Done.' % len(dataset)

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print conf
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print conf
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    output_conf = config['outputs']

    ######### Load Model #########
    model = load_model(config)

    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    eval_metrics = OrderedDict()
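    # A metric spec such as 'ndcg@10' is split at '@' into ('ndcg', '10') and
    # resolved to metrics.get('ndcg')(10); names without '@' map directly to a metric.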
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print '[Model] Model Compile Done.'

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print '[%s]\t[Train:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
            history = model.fit_generator(genfun,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)  #callbacks=[eval_map])
            print 'Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0])

        for tag, generator in eval_gen.items():
            #print('test tag: ', tag)
            genfun = generator.get_batch_generator()
            # print '[%s]\t[Eval:%s]' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
            # res = dict([[k,0.] for k in eval_metrics.keys()])
            res_scores = {
            }  # 2D dict; key qid-did ;value: predict_score, ground_truth
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    list_counts = input_data[
                        'list_counts']  # list_counts stores the boundaries between documents under different queries
                    y_pred = np.squeeze(y_pred)
                    for lc_idx in range(len(list_counts) - 1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx + 1]
                        for p, y, t in zip(input_data['ID'][pre:suf],
                                           y_pred[pre:suf], y_true[pre:suf]):
                            if p[0] not in res_scores:
                                res_scores[p[0]] = {}
                            res_scores[p[0]][p[1]] = (y, t)
                else:
                    raise NameError('not supported in this version!')
            generator.reset()
            sys.stdout.flush()
            # save predicted score files for valid/test data
            if (i_e + 1) % save_weights_iters == 0:
                score_list = []
                with open(
                        output_conf['predict']['save_path_during_train'] +
                        '-' + tag + '.' + str(i_e + 1), 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            score_l = '%s\tQ0\t%s\t%d\t%f\t%s\t%s' % (
                                qid, did, inum, score, config['net_name'], gt)
                            print >> f, score_l
                            score_list.append(score_l)
                # compute BLEU/ROUGE metrics at this check point
                ref_list = test_ref_list if tag == 'test' else valid_ref_list
                bleu_rouge_metrics = compute_bleu_rouge_given_scores_in_train(
                    score_list, corpus_dict, ref_list, tag)
                print '[%s]\t[Eval:%s] Iter:%d\t(bleu1-4 corpus_bleu rougel dist1 dist2 avglen)\t%s' \
                    % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag, i_e+1, bleu_rouge_metrics)

        if (i_e + 1) % save_weights_iters == 0 and is_save_weights == "1":  # option to control whether weight files are saved
            model.save_weights(weights_file % (i_e + 1))
Example #8
def train(config):

    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' %
          (input_train_conf.keys(), input_eval_conf.keys()),
          end='\n')

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    zmodel, kmodel = load_model(config)

    input = Input(name='input', shape=(2, 50))
    timeDistributed = TimeDistributed(layer=zmodel, input_shape=(2, 50))(input)
    z_knrm_model = Model(input=input, output=timeDistributed)

    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)

    epoch_num = 400
    batch_size = 200  # take a look at the config
    batch_num_per_epoch = 10
    #train_as_whole(z_knrm_model, zmodel, train_gen, eval_gen, eval_metrics)
    z_knrm_model.set_tensorboard("/tmp/matchzoo", "knrm-sgd-1e4")
    # train_per_epoch(z_knrm_model, zmodel, train_gen, eval_gen, eval_metrics, optimMethod=SGD(1e-4))

    train_per_epoch(z_knrm_model,
                    zmodel,
                    train_gen,
                    eval_gen,
                    eval_metrics,
                    optimMethod=SGD(1e-4,
                                    leaningrate_schedule=Poly(0.5, 50 * 400)))
Example #9
def train(config, para_path):

    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    optimizer = optimizers.get(optimizer)
    K.set_value(optimizer.lr, global_conf['learning_rate'])
    #weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    #an_config = json.load(open('./data/pinfo/config.py', 'r'))
    an_config = json.load(open(para_path, 'r'))
    dstdir = an_config["model_dst_dir"]
    if not os.path.exists(an_config['weights_dir']):
        os.mkdir(an_config['weights_dir'])
    weights_file = an_config['weights_dir'] + str(global_conf['weights_file'])
    config['metrics'] = an_config["metrics"]
    config['model']['model_path'] = an_config['model_path']

    share_input_conf['embed_path'] = dstdir + "embed_glove_d300"
    share_input_conf[
        'word_triletter_map_file'] = dstdir + "word_triletter_map.txt"
    share_input_conf['vocab_size'] = word_len(dstdir + "word_dict.txt")
    share_input_conf['text1_corpus'] = dstdir + "corpus_preprocessed.txt"
    share_input_conf['text2_corpus'] = dstdir + "corpus_preprocessed.txt"
    input_conf['train']['relation_file'] = dstdir + "relation_train.txt"
    input_conf['valid']['relation_file'] = dstdir + "relation_valid.txt"
    input_conf['test']['relation_file'] = dstdir + "relation_test.txt"
    input_conf['train'][
        'hist_feats_file'] = dstdir + "relation_train.binsum-20.txt"
    input_conf['valid'][
        'hist_feats_file'] = dstdir + "relation_valid.binsum-20.txt"
    input_conf['test'][
        'hist_feats_file'] = dstdir + "relation_test.binsum-20.txt"

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        # if no embed provided, use random
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    # print("input_conf", input_conf)
    # print("input_conf keys", input_conf.keys())
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' %
          (input_train_conf.keys(), input_eval_conf.keys()),
          end='\n')
    # print("input_train_conf", input_train_conf)
    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    model = load_model(config)
    # weights_file1 = str(global_conf['weights_file']) + '.' + str(global_conf['test_weights_iters'])
    # model.load_weights(weights_file1)
    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)

    print('[Model] Model Compile Done.', end='\n')

    base_metric = an_config["base_metric"]
    best_epoch = 0
    best_metric = -1
    best_result = ''
    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            history = model.fit_generator(genfun,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)  #callbacks=[eval_map])
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]),
                  end='\n')

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()
            print('Iter:%d\t%s' % (i_e, '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])),
                  end='\n')
            cur_metric = res[base_metric] / num_valid
            cur_res = '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])
            cur_metric_ls = {}
            for k, v in res.items():
                cur_metric_ls[k] = round(v / num_valid, 4)
            if cur_metric > best_metric and tag == 'valid':
                best_epoch = i_e
                best_metric = cur_metric
                best_result = cur_res
                best_metric_ls = cur_metric_ls
                model.save_weights(weights_file)
            sys.stdout.flush()
        #if (i_e+1) % save_weights_iters == 0:
        #model.save_weights(weights_file % (i_e+1))
    end_time = time.perf_counter()
    print('the best running result %s the best epoch %d' %
          (best_result, best_epoch))
    print('the running time %s seconds' % (end_time - start_time))
    db.insert_result(task_id, model_id, best_metric_ls)
Example #10
File: main.py  Project: hhh920406/MatchZoo
def train(config):

    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    optimizer = optimizers.get(optimizer)
    K.set_value(optimizer.lr, global_conf['learning_rate'])
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']


    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ), dtype=np.float32)
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict, embed = embed)
    else:
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' % (input_train_conf.keys(), input_eval_conf.keys()), end='\n')

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator( config = conf )

    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator( config = conf )

    ######### Load Model #########
    model = load_model(config)

    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s] ' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
            history = model.fit_generator(
                    genfun,
                    steps_per_epoch = display_interval,
                    epochs = 1,
                    shuffle=False,
                    verbose = 0
                ) #callbacks=[eval_map])
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]), end='\n')

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s] ' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
            res = dict([[k,0.] for k in eval_metrics.keys()])
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator), inputs.list_generator.ListBasicGenerator):
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts)-1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx+1]
                            res[k] += eval_func(y_true = y_true[pre:suf], y_pred = y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true = y_true, y_pred = y_pred)
                    num_valid += 1
            generator.reset()
            print('Iter:%d\t%s' % (i_e, '\t'.join(['%s=%f'%(k,v/num_valid) for k, v in res.items()])), end='\n')
            sys.stdout.flush()
        if (i_e+1) % save_weights_iters == 0:
            model.save_weights(weights_file % (i_e+1))
Example #11
File: main.py  Project: hhh920406/MatchZoo
def predict(config):
    ######## Read input config ########

    print(json.dumps(config, indent=2), end='\n')
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ), dtype=np.float32)
        embed = np.float32(np.random.uniform(-0.02, 0.02, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict, embed = embed)
    else:
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' % (input_predict_conf.keys()), end='\n')

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    predict_gen = OrderedDict()

    for tag, conf in input_predict_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
                                    #data1 = dataset[conf['text1_corpus']],
                                    #data2 = dataset[conf['text2_corpus']],
                                     config = conf )

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = str(global_conf['weights_file']) + '.' + str(global_conf['test_weights_iters'])

    model = load_model(config)
    model.load_weights(weights_file)

    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    res = dict([[k,0.] for k in eval_metrics.keys()])

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
        num_valid = 0
        res_scores = {}
        for input_data, y_true in genfun:
            y_pred = model.predict(input_data, batch_size=len(y_true) )

            if issubclass(type(generator), inputs.list_generator.ListBasicGenerator):
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts)-1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx+1]
                        res[k] += eval_func(y_true = y_true[pre:suf], y_pred = y_pred[pre:suf])

                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts)-1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx+1]
                    for p, y, t in zip(input_data['ID'][pre:suf], y_pred[pre:suf], y_true[pre:suf]):
                        if p[0] not in res_scores:
                            res_scores[p[0]] = {}
                        res_scores[p[0]][p[1]] = (y, t)

                num_valid += len(list_counts) - 1
            else:
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true = y_true, y_pred = y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    res_scores[p[0]][p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        if tag in output_conf:
            if output_conf[tag]['save_format'] == 'TREC':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(), key=lambda d:d[1][0], reverse=True)
                        for inum,(did, (score, gt)) in enumerate(dinfo):
                            f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n'%(qid, did, inum, score, config['net_name'], gt))
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(), key=lambda d:d[1][0], reverse=True)
                        for inum,(did, (score, gt)) in enumerate(dinfo):
                            f.write('%s %s %s %s\n'%(gt, qid, did, score))

        print('[Predict] results: ', '\t'.join(['%s=%f'%(k,v/num_valid) for k, v in res.items()]), end='\n')
        sys.stdout.flush()
Example #12
def predict(config):
    ######## Read input config ########

    print(json.dumps(config, indent=2), end='\n')
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    if 'pos_embed_path' in share_input_conf:
        embed_dict = read_embedding(
            filename=share_input_conf['pos_embed_path'])
        _PAD_ = share_input_conf['pos_vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['pos_embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['pos_vocab_size'],
                share_input_conf['pos_embed_size']
            ]))
        share_input_conf['pos_embed'] = convert_embed_2_numpy(embed_dict,
                                                              embed=embed)
    print('[Embedding] POS Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' %
          (input_predict_conf.keys()),
          end='\n')

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text1_postag_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_postag_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_postag_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_postag_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    predict_gen = OrderedDict()

    for tag, conf in input_predict_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        if 'text1_postag_corpus' in conf:
            conf['postag_data1'] = dataset[conf['text1_postag_corpus']]
        if 'text2_postag_corpus' in conf:
            conf['postag_data2'] = dataset[conf['text2_postag_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
            #data1 = dataset[conf['text1_corpus']],
            #data2 = dataset[conf['text2_corpus']],
            config=conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = str(global_conf['weights_file']) + '.' + str(
        global_conf['test_weights_iters'])

    model = load_model(config)
    model.load_weights(weights_file)

    generator = predict_gen['predict']
    input_data, y_true = next(generator.get_batch_generator())

    term_list1 = 'Double dopaminergic thistle'
    term_list2 = 'completes herbs textures topical'
    input_data = InputTransformer(config).build().genr_input(
        term_list1, term_list2)

    y_pred = model.predict(input_data)

    return y_pred
Example #13
File: main.py  Project: snowcement/MatchZoo
def train(config):
    # json.dumps() converts the dict-type config to a str; writing a dict straight into a JSON file raises an error, so it is serialised first
    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    optimizer = optimizers.get(optimizer)  #总是调用keras.optimizers
    K.set_value(
        optimizer.lr,
        global_conf['learning_rate'])  # set the variable's value from a NumPy array; lr = learning rate
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()  #使用OrderedDict会根据放入元素的先后顺序进行排序
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print(
        '[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' %
        (input_train_conf.keys(), input_eval_conf.keys()),
        end='\n'
    )  #odict_keys(['train']) in TRAIN, odict_keys(['valid', 'test']) in EVAL.

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    model = load_model(config)

    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')
    #add tensorboard check
    # board = keras.callbacks.TensorBoard(log_dir='../data/toy_example/logs', histogram_freq=0)
    # history = LossHistory()

    for i_e in range(num_iters):  # num_iters plays the role of epochs
        for tag, generator in train_gen.items():
            # the genfun generator yields batch_size*2 samples per step (half positive, half negative)
            # display_interval = len(pair_list)//(batch_size*2)
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            history = model.fit_generator(
                genfun,
                steps_per_epoch=display_interval,
                epochs=1,
                shuffle=False,
                verbose=0,
                #callbacks = [history, board]
            )  #callbacks=[eval_map])
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]),
                  end='\n')

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()
            print('Iter:%d\t%s' % (i_e, '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])),
                  end='\n')
            sys.stdout.flush()
        if (i_e + 1) % save_weights_iters == 0:
            model.save_weights(weights_file % (i_e + 1))
Example #14
File: main1.py  Project: 787264137/ks
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])  # 获取对应的generator
<<<<<<< HEAD
        eval_gen[tag] = generator( config = conf )  # 通过对应的generator加载数据,获取 X1, X1_len, X2, X2_len, Y
=======
        eval_gen[tag] = generator(config=conf)  # 通过对应的generator加载数据,获取 X1, X1_len, X2, X2_len, Y
Example #15
def predict(config):
    ######## Read input config ########

    print(json.dumps(config, indent=2))
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['fill_word']
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' %
          (input_predict_conf.keys()))

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generator
    predict_gen = OrderedDict()

    for tag, conf in input_predict_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
            #data1 = dataset[conf['text1_corpus']],
            #data2 = dataset[conf['text2_corpus']],
            config=conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = global_conf['weights_file']

    model = load_model(config)
    model.load_weights(weights_file)

    eval_metrics = OrderedDict()
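    # metric names such as 'ndcg@10' are split into the metric and its cutoff k; plain names map directly to a metric function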
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    res = dict([[k, 0.] for k in eval_metrics.keys()])

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[Predict] @ %s ' % tag, end='')
        num_valid = 0
        res_scores = {}
        for input_data, y_true in genfun:
            list_counts = input_data['list_counts']
            y_pred = model.predict(input_data, batch_size=len(y_true))

            for k, eval_func in eval_metrics.items():
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    res[k] += eval_func(y_true=y_true[pre:suf],
                                        y_pred=y_pred[pre:suf])

            y_pred = np.squeeze(y_pred)
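            # group (score, label) pairs as res_scores[query_id][doc_id] for writing ranked output later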
            for lc_idx in range(len(list_counts) - 1):
                pre = list_counts[lc_idx]
                suf = list_counts[lc_idx + 1]
                for p, y, t in zip(input_data['ID'][pre:suf], y_pred[pre:suf],
                                   y_true[pre:suf]):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    res_scores[p[0]][p[1]] = (y, t)

            num_valid += len(list_counts) - 1
        generator.reset()

        if tag in output_conf:
            if output_conf[tag]['save_format'] == 'TREC':
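                # TREC run format: qid Q0 docid rank score run_name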
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            f.write('%s\tQ0\t%s\t%d\t%f\t%s\n' % (
                                qid, did, inum, score, config['net_name']))
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            f.write('%s %s %s %s\n' % (gt, qid, did, score))

        print('[Predict] results: ', '  '.join(
            ['%s:%f' % (k, v / num_valid) for k, v in res.items()]))
        sys.stdout.flush()
예제 #16
0
def predict(config):
    ######## Read input config ########

    print(json.dumps(config, indent=2))
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ), dtype=np.float32)
        embed = np.float32(np.random.uniform(-0.02, 0.02, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict, embed = embed)
    else:
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' % (input_predict_conf.keys()))

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath] = read_data_2d(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath] = read_data_2d(datapath)
            if 'qa_comat_file' in input_conf[tag]:  # qa_comat_file for qa_cooccur_matrix in DMN_KD_CQA and DMN_KD_Web
                datapath = input_conf[tag]['qa_comat_file']
                if datapath not in dataset:
                    dataset[datapath] = read_qa_comat(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generator
    predict_gen = OrderedDict()

    for tag, conf in input_predict_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        if 'qa_comat_file' in share_input_conf:
            conf['qa_comat'] = dataset[conf['qa_comat_file']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
                                    #data1 = dataset[conf['text1_corpus']],
                                    #data2 = dataset[conf['text2_corpus']],
                                     config = conf )

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = str(global_conf['weights_file']) + '.' + str(global_conf['test_weights_iters']) + '-' + str(seed)
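    # each multi-task net_name variant returns a different set of sub-models from load_model, so load the matching weight file(s)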

    if config['net_name'] == 'DMN_CNN_MTL':
        model, model_clf = load_model(config)
        model.load_weights(weights_file)
    elif config['net_name'] == 'DMN_CNN_INTENTS':
        model_clf = load_model(config)
        model_clf.load_weights(weights_file)
    elif config['net_name'] == 'DMN_CNN_MTL_Web' or config['net_name'] == 'DMN_CNN_MTL_Web_v2':
        model, model_web = load_model(config)
        model.load_weights(weights_file)
        weights_file_web = str(global_conf['weights_file_web']) + '.' + str(global_conf['test_weights_iters']) + '-' + str(seed)
        model_web.load_weights(weights_file_web)
    elif config['net_name'] == 'DMN_CNN_MTL_All':
        model, model_web, model_clf = load_model(config)
        model.load_weights(weights_file)
        weights_file_web = str(global_conf['weights_file_web']) + '.' + str(
            global_conf['test_weights_iters']) + '-' + str(seed)
        model_web.load_weights(weights_file_web)
    else:
        model = load_model(config)
        model.load_weights(weights_file)

    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)

    res = dict([[k,0.] for k in eval_metrics.keys()])

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
        num_valid = 0
        res_scores = {}
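        # pick the sub-model that corresponds to this predict tag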

        if tag == 'predict':
            model_to_evaluate = model
        elif tag == 'predict_clf':
            model_to_evaluate = model_clf
        elif tag == 'predict_web':
            model_to_evaluate = model_web

        for input_data, y_true in genfun:
            y_pred = model_to_evaluate.predict(input_data, batch_size=len(y_true))
            
            if tag == 'predict_clf':
                y_pred = np.argmax(y_pred, axis=1)

            if issubclass(type(generator), inputs.list_generator.ListBasicGenerator):
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts)-1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx+1]
                        res[k] += eval_func(y_true = y_true[pre:suf], y_pred = y_pred[pre:suf])

                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    for p, y, t in zip(input_data['ID'][pre:suf], y_pred[pre:suf], y_true[pre:suf]):
                        if tag == 'predict_clf':
                            res_scores[p[0]] = (y, t)
                        else:
                            if p[0] not in res_scores:
                                res_scores[p[0]] = {}
                            res_scores[p[0]][p[1]] = (y, t)

                num_valid += len(list_counts) - 1
            else:
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true = y_true, y_pred = y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    res_scores[p[0]][p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        if tag in output_conf:
            save_path = output_conf[tag]['save_path'] + '-' + str(seed)

            if output_conf[tag]['save_format'] == 'TREC':
                with open(save_path, 'w') as f:
                    if tag == 'predict_clf':
                        for qid, entry in res_scores.items():
                            f.write('%s\t%d\t%d\n' % (qid, entry[0], entry[1]))
                    else:
                        for qid, dinfo in res_scores.items():
                            dinfo = sorted(dinfo.items(), key=lambda d:d[1][0], reverse=True)
                            for inum,(did, (score, gt)) in enumerate(dinfo):
                                f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n' % (qid, did, inum, score, config['net_name'], gt))
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(save_path, 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(), key=lambda d:d[1][0], reverse=True)
                        for inum,(did, (score, gt)) in enumerate(dinfo):
                            f.write('%s %s %s %s\n' % (gt, qid, did, score))

        print('[Predict] results: ', '\t'.join(['%s=%f' % (k, v / num_valid) for k, v in res.items()]))
        sys.stdout.flush()
예제 #17
0
def predict(config):
    ######## Read input config ########

    print(json.dumps(config, indent=2), end='\n')
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' %
          (input_predict_conf.keys()),
          end='\n')

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    predict_gen = OrderedDict()

    for tag, conf in input_predict_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
            #data1 = dataset[conf['text1_corpus']],
            #data2 = dataset[conf['text2_corpus']],
            config=conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = str(global_conf['weights_file']) + '.' + str(
        global_conf['test_weights_iters'])

    model = load_model(config)
    model.load_weights(weights_file)
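    # a second model exposing the two attention layers' outputs so their scores can be printed for inspection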
    encoder = Model(inputs=model.input,
                    outputs=[
                        model.get_layer('att_layer_2').output,
                        model.get_layer('att_layer_3').output
                    ])
    # encoder = Model(inputs=model.input, outputs=model.get_layer('att_layer_2').output)
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    res = dict([[k, 0.] for k in eval_metrics.keys()])

    # this loop prints the attention scores for the query and the sentence
    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
              end='')
        num_valid = 0
        res_scores = {}
        for input_data, y_true in genfun:

            y_pred1, y_pred2 = encoder.predict(input_data,
                                               batch_size=len(y_true))
            y_pred1 = _to_list(np.squeeze(y_pred1).tolist())
            y_pred2 = _to_list(np.squeeze(y_pred2).tolist())
            # print("y_pred", len(y_pred), len(y_pred[0]))
            print(input_data)
            print("sent", y_pred1)
            print("query", y_pred2)
            print()

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
              end='')
        num_valid = 0
        res_scores = {}
        for input_data, y_true in genfun:
            y_pred = model.predict(input_data, batch_size=len(y_true))

            if issubclass(type(generator),
                          inputs.list_generator.ListBasicGenerator):
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts) - 1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx + 1]
                        res[k] += eval_func(y_true=y_true[pre:suf],
                                            y_pred=y_pred[pre:suf])

                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    for p, y, t in zip(input_data['ID'][pre:suf],
                                       y_pred[pre:suf], y_true[pre:suf]):
                        if p[0] not in res_scores:
                            res_scores[p[0]] = {}
                        res_scores[p[0]][p[1]] = (y, t)

                num_valid += len(list_counts) - 1
            else:
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    res_scores[p[0]][p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        if tag in output_conf:
            if output_conf[tag]['save_format'] == 'TREC':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n' %
                                    (qid, did, inum, score, config['net_name'],
                                     gt))
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            f.write('%s %s %s %s\n' % (gt, qid, did, score))

        print('[Predict] results: ',
              '\t'.join(['%s=%f' % (k, v / num_valid)
                         for k, v in res.items()]),
              end='\n')
        sys.stdout.flush()
예제 #18
0
def train(config):

    if seed is None:
        raise Exception('Seed should be set')
    print('Using seed: ' + str(seed))
    # read basic config
    global_conf = config["global"]
    learning_rate = global_conf['learning_rate']
    use_existing_weights = global_conf['use_existing_weights'] if 'use_existing_weights' in global_conf else None
    optimizer = Adam(lr=learning_rate)
    weights_file = str(global_conf['weights_file']) + '.%d'
    weights_file_web = str(global_conf['weights_file_web']) + '.%d' if 'weights_file_web' in global_conf else None
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
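        # the last vocabulary index is used as the padding token and mapped to an all-zero vector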
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ), dtype=np.float32)
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict, embed = embed)
    else:
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue

        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath] = read_data_2d(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath] = read_data_2d(datapath)
        if 'qa_comat_file' in input_conf[tag]: # qa_comat_file for qa_cooccur_matrix in DMN_KD
            datapath = input_conf[tag]['qa_comat_file']
            if datapath not in dataset:
                dataset[datapath] = read_qa_comat(datapath)

    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        if 'qa_comat_file' in share_input_conf:
            conf['qa_comat'] = dataset[conf['qa_comat_file']]

        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]

        if 'qa_comat_file' in share_input_conf:
            conf['qa_comat'] = dataset[conf['qa_comat_file']]

        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########

    if config['net_name'] == 'DMN_CNN_MTL':
        model, model_clf = load_model(config)
        if use_existing_weights:
            weights_file_to_load = str(global_conf['weights_file']) + '.' + str(
                global_conf['weights_to_load']) + '-' + str(seed)
            model.load_weights(weights_file_to_load)

        model_clf.compile(optimizer=optimizer, loss=custom_loss)
        print('[Model] MTL models Compile Done.')
    elif config['net_name'] == 'DMN_CNN_INTENTS':
        model_clf = load_model(config)
        model_clf.compile(optimizer=optimizer, loss=custom_loss)
        print('[Model] Intent Only classifier model Compile Done.')
    elif config['net_name'] == 'DMN_CNN_MTL_Web' or config['net_name'] == 'DMN_CNN_MTL_Web_v2':
        model, model_web = load_model(config)
        if use_existing_weights:
            weights_file_to_load = str(global_conf['weights_file']) + '.' + str(
                global_conf['weights_to_load']) + '-' + str(seed)
            model.load_weights(weights_file_to_load)
            weights_file_web = str(global_conf['weights_file_web']) + '.' + str(
                global_conf['weights_to_load']) + '-' + str(seed)
            model_web.load_weights(weights_file_web)
    elif config['net_name'] == 'DMN_CNN_MTL_All':
        model, model_web, model_clf = load_model(config)
        model_clf.compile(optimizer=optimizer, loss=custom_loss)
    else:
        model = load_model(config)
        if use_existing_weights:
            weights_file_to_load = str(global_conf['weights_file']) + '.' + str(
                global_conf['test_weights_iters']) + '-' + str(seed)
            model.load_weights(weights_file_to_load)

        print('[Model] Response Ranking model Compile Done.')

    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)

    if config['net_name'] != 'DMN_CNN_INTENTS':
        loss = []
        for lobj in config['losses']:
            if lobj['object_name'] in mz_specialized_losses:
                loss.append(rank_losses.get(lobj['object_name'])(lobj['object_params']))
            else:
                loss.append(rank_losses.get(lobj['object_name']))

        model.compile(optimizer=optimizer, loss=loss)
        print('[Model] Model Compile Done.')

        if config['net_name'] == 'DMN_CNN_MTL_Web' or config['net_name'] == 'DMN_CNN_MTL_Web_v2' \
                or config['net_name'] == 'DMN_CNN_MTL_All':
            model_web.compile(optimizer=optimizer, loss=loss)
            print('[Model Web] Model Compile Done')

    if share_input_conf['predict'] == 'False':
        if 'test' in eval_gen:
            del eval_gen['test']
        if 'valid' in eval_gen:
            del eval_gen['valid']
        if 'eval_predict_in' in eval_gen:
            del eval_gen['eval_predict_in']

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s]' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
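            # pick the sub-model to train for this tag: model_clf for 'train_clf', model_web for 'train_web', the main model for 'train'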

            if tag == "train_clf":
                correct_model = model_clf
            elif tag == 'train_web':
                correct_model = model_web
            elif tag == "train":
                correct_model = model

            history = correct_model.fit_generator(
                genfun,
                steps_per_epoch=display_interval,  # if display_interval = 10, then there are 10 batches in 1 epoch
                epochs=1,
                shuffle=False,
                verbose=0)

            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]))

        if (i_e+1) % save_weights_iters == 0:
            for tag, generator in eval_gen.items():
                print('Evaluating tag:' + str(tag))
                genfun = generator.get_batch_generator()
                print('[%s]\t[Eval:%s]' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
                res = dict([[k,0.] for k in eval_metrics.keys()])
                num_valid = 0

                if tag == "valid":
                    correct_model = model
                elif tag == "valid_web":
                    correct_model = model_web
                elif tag == "valid_clf":
                    correct_model = model_clf

                for input_data, y_true in genfun:
                    y_pred = correct_model.predict(input_data, batch_size=len(y_true))
                    if issubclass(type(generator), inputs.list_generator.ListBasicGenerator):
                        list_counts = input_data['list_counts']
                        for k, eval_func in eval_metrics.items():
                            for lc_idx in range(len(list_counts)-1):
                                pre = list_counts[lc_idx]
                                suf = list_counts[lc_idx+1]
                                res[k] += eval_func(y_true = y_true[pre:suf], y_pred = y_pred[pre:suf])
                        num_valid += len(list_counts) - 1
                    else:
                        for k, eval_func in eval_metrics.items():
                            res[k] += eval_func(y_true = y_true, y_pred = y_pred)
                        num_valid += 1
                generator.reset()
                print('Iter:%d\t%s' % (i_e, '\t'.join(['%s=%f' % (k, v / num_valid) for k, v in res.items()])))
                sys.stdout.flush()

        sys.stdout.flush()

        weights_file_name = (weights_file % (i_e+1)) + '-' + str(seed)
        if (i_e+1) % save_weights_iters == 0:
            if config['net_name'] == 'DMN_CNN_MTL_Web' or config['net_name'] == 'DMN_CNN_MTL_Web_v2'  \
                    or config['net_name'] == 'DMN_CNN_MTL_All':
                weights_file_name_web = (weights_file_web % (i_e + 1)) + '-' + str(seed)
                model.save_weights(weights_file_name)
                model_web.save_weights(weights_file_name_web)
            elif config['net_name'] != 'DMN_CNN_INTENTS':
                model.save_weights(weights_file_name)
            else:
                model_clf.save_weights(weights_file_name)
예제 #19
0
def train(config):

    print(json.dumps(config, indent=2))
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' %
          (input_train_conf.keys(), input_eval_conf.keys()))

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    model = load_model(config)

    loss = []
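    # ranking losses are looked up by name; losses listed in mz_specialized_losses also receive their parameter dict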
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.')

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            history = model.fit_generator(genfun,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)  #callbacks=[eval_map])
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]))

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()
            print('Iter:%d\t%s' % (i_e, '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])))
            sys.stdout.flush()
        if (i_e + 1) % save_weights_iters == 0:
            model.save_weights(weights_file % (i_e + 1))
예제 #20
0
def predict(config):
    print(json.dumps(config, indent=2))

    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    assert 'embed_path' in share_input_conf
    embed_dict, vocab_size, embed_size, word_dict, idf_dict = read_embedding(
        share_input_conf['embed_path'])
    share_input_conf['word_dict'] = word_dict

    share_input_conf['vocab_size'] = vocab_size
    share_input_conf['embed_size'] = embed_size
    embed = np.float32(np.random.uniform(-9, 9, [vocab_size, embed_size]))
    embed_normalize = False
    if 'drmm' in config['model']['model_py'].lower():
        embed_normalize = True
    share_input_conf['embed'] = convert_embed_2_numpy(
        'embed', embed_dict=embed_dict, embed=embed, normalize=embed_normalize)
    idf = np.float32(np.random.uniform(4, 9, [vocab_size, 1]))
    share_input_conf['idf_feat'] = convert_embed_2_numpy('idf',
                                                         embed_dict=idf_dict,
                                                         embed=idf,
                                                         normalize=False)
    print('[%s]' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
          '[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            continue
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[%s]' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), end=' ')
    print('[Input] Process Input Tags. %s in EVAL.' % (input_eval_conf.keys()))

    # initial data generator
    eval_gen = OrderedDict()

    for tag, conf in input_eval_conf.items():
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    _model = load_model(config)
    # model = multi_gpu_model(_model, gpus=2)
    model = _model

    if 'load_weights_path' in global_conf:
        model.load_weights(global_conf['load_weights_path'])
    else:
        print('no load_weights_path')
        exit(0)

    loss = []
    for lobj in config['losses']:
        loss.append(rank_losses.get(lobj))
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print('[%s]' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
          '[Model] Model Compile Done.\n')

    print('\n### Model Info ###')
    model.summary()
    print('### Model Info ###\n')

    for i_e in range(1):
        for tag, generator in eval_gen.items():
            output_dir = config['net_name'].split('_')[0]
            output = open(
                '../output/%s/%s_%s_predict_output_%s.txt' %
                (output_dir, config['net_name'], tag, str(i_e + 1)), 'w')
            qid_uid_rel_score = {}
            qid_uid_score = {}
            genfun = generator.get_batch_generator()

            for input_data, y_true, curr_batch in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                y_pred_reshape = np.reshape(y_pred, (len(y_pred), ))
                # output the predict scores
                for (q, d, label), score in zip(curr_batch, y_pred_reshape):
                    output.write('%s\t%s\t%s\t%s\n' %
                                 (str(q), str(d), str(label), str(score)))

                    if q not in qid_uid_score:
                        qid_uid_score[q] = {}
                    qid_uid_score[q][d] = score
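                    # also collect labels and scores per query so list-wise metrics can be computed after the loop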

                    if q not in qid_uid_rel_score:
                        qid_uid_rel_score[q] = dict(label=list(), score=list())
                    qid_uid_rel_score[q]['label'].append(label)
                    qid_uid_rel_score[q]['score'].append(score)

            output.close()
            # calculate the metrics
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            for k, eval_func in eval_metrics.items():
                for qid in qid_uid_rel_score:
                    res[k] += eval_func(y_true=qid_uid_rel_score[qid]['label'],
                                        y_pred=qid_uid_rel_score[qid]['score'])
                res[k] /= len(qid_uid_rel_score)

            if 'test' in tag:
                print('[%s]' % time.strftime("%Y-%m-%d %H:%M:%S",
                                              time.localtime()), end=' ')
                print('[Eval] @ epoch: %d,' % (i_e + 1), end=' ')
                print(', '.join(['%s: %.5f' % (k, res[k]) for k in res]))
            else:
                # calculate the eval_loss
                all_pairs = generator.get_all_pairs()
                all_pairs_rel_score = {}
                for qid, dp_id, dn_id in all_pairs:
                    all_pairs_rel_score[(qid, dp_id, dn_id)] = {}
                    all_pairs_rel_score[(qid, dp_id, dn_id)]['score'] = [
                        qid_uid_score[qid][dp_id], qid_uid_score[qid][dn_id]
                    ]
                    all_pairs_rel_score[(qid, dp_id,
                                         dn_id)]['rel'] = all_pairs[(qid,
                                                                     dp_id,
                                                                     dn_id)]

                eval_loss = cal_eval_loss(all_pairs_rel_score, tag,
                                          config['losses'])

                print('[%s]' % time.strftime("%Y-%m-%d %H:%M:%S",
                                              time.localtime()), end=' ')
                print('[Eval] @ epoch: %d,' % (i_e + 1), end=' ')
                print(', '.join(
                    ['%s: %.5f' % (k, eval_loss[k]) for k in eval_loss]), end=' ')
                print(', '.join(['%s: %.5f' % (k, res[k]) for k in res]))

        print('')
예제 #21
0
def predict(config):
    ######## Read input config ########

    print(json.dumps(config, indent=2), end='\n')
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' %
          (input_predict_conf.keys()),
          end='\n')

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    predict_gen = OrderedDict()

    for tag, conf in input_predict_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
            #data1 = dataset[conf['text1_corpus']],
            #data2 = dataset[conf['text2_corpus']],
            config=conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = str(global_conf['weights_file']) + '.' + str(
        global_conf['test_weights_iters'])

    zmodel, kmodel = load_model(config)

    # test y_pred from zoo model and keras model
    # keras2_y_pred = kmodel.predict(input_data, batch_size=batch_size)
    # y_pred = model.forward(input_data)
    # # y_pred = model.predict(input_data, distributed=False)
    # equal = np.allclose(y_pred, keras2_y_pred, rtol=1e-5, atol=1e-5)
    # print(equal)
    # return y_pred

    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    res = dict([[k, 0.] for k in eval_metrics.keys()])

    # batch_size = 20
    # query_data = np.random.randint(0, 10000, [batch_size, 10])
    # doc_data = np.random.randint(0, 10000, [batch_size, 40])
    # input_data = [query_data, doc_data]
    # keras2_y_pred = keras2_model.predict(input_data, batch_size=batch_size)
    # y_pred = model.predict(input_data, distributed=False)
    # equal = np.allclose(y_pred, keras2_y_pred, rtol=1e-5, atol=1e-5)
    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
              end='')
        num_valid = 0
        res_scores = {}
        for input_data, y_true in genfun:
            ky_pred = kmodel.predict(input_data, batch_size=len(y_true))
            names = ['query', 'doc']
            shapes = [(None, 10), (None, 40)]
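            # turn the feed dict into the ordered list of arrays the named inputs expect, then compare zmodel.forward against the Keras prediction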
            list_input_data = _standardize_input_data(input_data,
                                                      names,
                                                      shapes,
                                                      check_batch_axis=False)
            # list_input_data = [data[0:2, :] for data in list_input_data]
            # y_pred = zmodel.predict(list_input_data, distributed=False)
            y_pred = zmodel.forward(list_input_data)
            equal = np.allclose(y_pred, ky_pred, rtol=1e-5, atol=1e-5)
            print(equal)

            if issubclass(type(generator),
                          inputs.list_generator.ListBasicGenerator):
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts) - 1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx + 1]
                        res[k] += eval_func(y_true=y_true[pre:suf],
                                            y_pred=y_pred[pre:suf])

                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    for p, y, t in zip(input_data['ID'][pre:suf],
                                       y_pred[pre:suf], y_true[pre:suf]):
                        if p[0] not in res_scores:
                            res_scores[p[0]] = {}
                        res_scores[p[0]][p[1]] = (y, t)

                num_valid += len(list_counts) - 1
            else:
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    res_scores[p[0]][p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        if tag in output_conf:
            if output_conf[tag]['save_format'] == 'TREC':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n' %
                                    (qid, did, inum, score, config['net_name'],
                                     gt))
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            f.write('%s %s %s %s\n' % (gt, qid, did, score))

        print('[Predict] results: ',
              '\t'.join(['%s=%f' % (k, v / num_valid)
                         for k, v in res.items()]),
              end='\n')
        sys.stdout.flush()
예제 #22
0
def train(config):

    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    optimizer = optimizers.get(optimizer)
    K.set_value(optimizer.lr, global_conf['learning_rate'])
    weights_file = str(global_conf['weights_file']) + '.%d'

    global logs_dir
    logs_dir = str(global_conf['logs'])
    if not os.path.exists(logs_dir):
        os.makedirs(logs_dir)

    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    input_eval_loss_conf = OrderedDict()
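    # stats_for_plots accumulates per-tag loss and metric histories over iterations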
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
            stats_for_plots[tag + '_loss'] = dict()
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
            stats_for_plots[tag] = dict()
        elif input_conf[tag]['phase'] == 'EVAL_LOSS':
            input_eval_loss_conf[tag] = {}
            input_eval_loss_conf[tag].update(share_input_conf)
            input_eval_loss_conf[tag].update(input_conf[tag])
            stats_for_plots[tag] = dict()
    print(
        '[Input] Process Input Tags. %s in TRAIN, %s in EVAL, %s in EVAL_LOSS.'
        % (input_train_conf.keys(), input_eval_conf.keys(),
           input_eval_loss_conf.keys()),
        end='\n')

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    eval_loss_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_loss_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_loss_gen[tag] = generator(config=conf)
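
    # Illustrative helper (not part of the original script): the three loops
    # above differ only in the target dict, so they could be collapsed into a
    # single function such as the sketch below. It is defined here purely as
    # an example and is never called.
    def _build_generators(conf_dict):
        gens = OrderedDict()
        for _tag, _conf in conf_dict.items():
            _conf['data1'] = dataset[_conf['text1_corpus']]
            _conf['data2'] = dataset[_conf['text2_corpus']]
            gens[_tag] = inputs.get(_conf['input_type'])(config=_conf)
        return gens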

    ######### Load Model #########
    model = load_model(config)

    loss = []
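    # Resolve each configured loss from the rank_losses registry; losses in
    # mz_specialized_losses are constructed with their 'object_params'.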
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
        for k, v in stats_for_plots.items():
            if 'loss' in k:
                stats_for_plots[k][lobj['object_name']] = []
    eval_metrics = OrderedDict()
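    # Metric names of the form 'name@k' (e.g. 'ndcg@10') are split into the
    # metric function and an integer cutoff; plain names map directly.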
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
        for k, v in stats_for_plots.items():
            if 'loss' not in k:
                stats_for_plots[k][mobj] = []
    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')

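    # Main training loop: each outer iteration fits `display_interval` batches
    # per TRAIN tag, logs the train/validation loss, then runs the EVAL
    # generators. num_iters, display_interval, save_weights_iters, weights_file
    # and logs_dir are assumed to be set earlier in this script from
    # config['global'].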
    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            evalfun = eval_loss_gen['test_loss'].get_batch_generator()
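            # NOTE: the validation-loss generator is looked up under the
            # hardcoded tag 'test_loss', so the config must define an
            # EVAL_LOSS input with exactly that name.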
            print('*' * 100)

            history = model.fit_generator(
                genfun,
                steps_per_epoch=display_interval,
                epochs=1,
                shuffle=False,
                verbose=0,
                validation_data=evalfun,
                validation_steps=display_interval,
                callbacks=[
                    TrainValTensorBoard(log_dir=os.path.join(
                        logs_dir, 'tensorboard'),
                                        global_step=display_interval * i_e,
                                        write_graph=False)
                ])  #callbacks=[eval_map])
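            # TrainValTensorBoard is assumed to be a custom Keras callback
            # defined elsewhere in this project; it writes train and
            # validation scalars under logs_dir/tensorboard at the given
            # global step.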
            # Record the loss for every train-loss tag. NOTE: `lobj` is simply
            # the last entry of config['losses'], so this bookkeeping assumes
            # a single loss object is configured.
            for k, v in stats_for_plots.items():
                if 'loss' in k:
                    print('[%s]\t[Train:%s] ' % (time.strftime(
                        '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), k),
                          end='')
                    if any(srchstr in k
                           for srchstr in ('test', 'val', 'valid')):
                        _l = history.history['val_loss'][0]
                    else:
                        _l = history.history['loss'][0]
                    stats_for_plots[k][lobj['object_name']].append(_l)
                    print('Iter:%d\t Loss =%.6f' % (i_e, _l), end='\n')
            print('-' * 50)

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            res = {k: 0. for k in eval_metrics}
            num_valid = 0
            #history_eval = model.evaluate_generator(genfun, steps=1)
            #print("history_eval: {}".format(history_eval))
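            # For list-wise generators, metrics are computed per query list
            # using the 'list_counts' offsets and averaged over lists; for
            # point/pair-wise generators they are averaged over batches.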
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()
            for k, v in res.items():
                stats_for_plots[tag][k].append(v / num_valid)

            print('Iter:%d\t%s' % (i_e, '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])),
                  end='\n')
            sys.stdout.flush()
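        # Checkpoint every `save_weights_iters` iterations; `weights_file` is
        # assumed to be a path template containing a %d placeholder.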
        if (i_e + 1) % save_weights_iters == 0:
            model.save_weights(weights_file % (i_e + 1))
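    # export_loss() and export_metrics() are assumed to be helpers defined
    # earlier in this script that persist the collected stats_for_plots
    # histories once training finishes.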
    export_loss(False, 1)
    export_metrics(False, 1)