def load_eval_metrics(config):
    """Build an ordered map from metric spec strings to metric callables.

    A spec of the form ``name@k`` (e.g. ``ndcg@10``) resolves ``name`` in the
    ``metrics`` registry and instantiates it with the integer cutoff ``k``;
    a plain spec resolves directly.  Specs are lower-cased before lookup and
    the lower-cased spec is used as the key, preserving config order.
    """
    resolved = OrderedDict()
    for spec in config['metrics']:
        spec = spec.lower()
        if '@' not in spec:
            resolved[spec] = metrics.get(spec)
            continue
        name, cutoff = spec.split('@', 1)
        resolved[spec] = metrics.get(name)(int(cutoff))
    return resolved
def similar_items(self, item, metric='euclidean', n=50):
    """Return the *n* items scoring highest against *item* under *metric*.

    Looks up the distance function, loads all ratings into a DataFrame,
    scores every other movie_id against *item*, and returns the top-n
    (movie_id, score) pairs.

    Raises:
        KeyError: if *item* is not a known movie_id, or *metric* does not
            name a configured, callable distance function.
    """
    # Metric jump table.  Bug fix: the original named this local `metrics`,
    # shadowing the `metrics` module and raising UnboundLocalError while the
    # dict literal itself was being evaluated.
    metric_table = {
        'euclidean': metrics.euclidean_distance,
        'pearson': metrics.pearson_correlation,
    }
    distance = metric_table.get(metric)
    ratings = pd.DataFrame(list(self.rating_service.get_all()))
    # Handle problems that might occur.
    # Bug fix: `in` on a pandas Series tests the *index*, not the values.
    if item not in ratings['movie_id'].values:
        raise KeyError("Unknown item, '%s'." % item)
    if not callable(distance):  # covers both None and non-callable entries
        raise KeyError("Unknown or unprogrammed distance metric '%s'." % metric)
    similar_items = {}
    for similar_item in ratings['movie_id']:
        if similar_item == item:
            continue
        # NOTE(review): indexing `similar_item['user_id']` assumes the ids in
        # the 'movie_id' column behave like mappings with a 'user_id' key —
        # confirm against the rating service's schema.
        similar_items[similar_item] = distance(
            self.recommender_service.get_shared_preferences(
                similar_item['user_id'], item['user_id']))
    # Bug fix: the original returned `items.items()` — `items` was never
    # defined (NameError); the accumulator is `similar_items`.
    return heapq.nlargest(n, similar_items.items(), key=itemgetter(1))
def train(config, para_path):
    """Train a MatchZoo-style ranking model.

    Reads the run configuration from `config`, overrides paths/metrics/model
    location from the JSON parameter file at `para_path`, builds TRAIN/EVAL
    data generators, trains for `num_iters` display-interval epochs, tracks
    the best validation score, saves the best weights and records the final
    result via `db.insert_result`.
    """
    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    # Resolve the optimizer name to a Keras optimizer and override its LR.
    optimizer = optimizers.get(optimizer)
    K.set_value(optimizer.lr, global_conf['learning_rate'])
    #weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])
    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']
    #an_config = json.load(open('./data/pinfo/config.py', 'r'))
    # External parameter file: supplies data dirs, metrics and the model path,
    # overriding what the main config carries.
    an_config = json.load(open(para_path, 'r'))
    dstdir = an_config["model_dst_dir"]
    if not os.path.exists(an_config['weights_dir']):
        os.mkdir(an_config['weights_dir'])
    weights_file = an_config['weights_dir'] + str(global_conf['weights_file'])
    config['metrics'] = an_config["metrics"]
    config['model']['model_path'] = an_config['model_path']
    # Rewire all corpus/relation/feature paths into the preprocessed data dir.
    share_input_conf['embed_path'] = dstdir + "embed_glove_d300"
    share_input_conf[
        'word_triletter_map_file'] = dstdir + "word_triletter_map.txt"
    share_input_conf['vocab_size'] = word_len(dstdir + "word_dict.txt")
    share_input_conf['text1_corpus'] = dstdir + "corpus_preprocessed.txt"
    share_input_conf['text2_corpus'] = dstdir + "corpus_preprocessed.txt"
    input_conf['train']['relation_file'] = dstdir + "relation_train.txt"
    input_conf['valid']['relation_file'] = dstdir + "relation_valid.txt"
    input_conf['test']['relation_file'] = dstdir + "relation_test.txt"
    input_conf['train'][
        'hist_feats_file'] = dstdir + "relation_train.binsum-20.txt"
    input_conf['valid'][
        'hist_feats_file'] = dstdir + "relation_valid.binsum-20.txt"
    input_conf['test'][
        'hist_feats_file'] = dstdir + "relation_test.binsum-20.txt"
    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # Last vocab slot is the padding row: an all-zero vector.
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        # Random init used for words missing from the pretrained file.
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        # if no embed provided, use random
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')
    # list all input tags and construct tags config;
    # each tag config = shared config overridden by the tag's own keys.
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    # print("input_conf", input_conf)
    # print("input_conf keys", input_conf.keys())
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' %
          (input_train_conf.keys(), input_eval_conf.keys()), end='\n')
    # print("input_train_conf", input_train_conf)
    # collect dataset identification — corpora are cached by path so shared
    # files are read only once.
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')
    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)
    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)
    ######### Load Model #########
    model = load_model(config)
    # weights_file1 = str(global_conf['weights_file']) + '.' + str(global_conf['test_weights_iters'])
    # model.load_weights(weights_file1)
    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            # Specialized losses take their parameters at construction time.
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    # Metric specs 'name@k' resolve to callables built with cutoff k.
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')
    # base_metric decides which metric selects the best epoch.
    base_metric = an_config["base_metric"]
    best_epoch = 0
    best_metric = -1
    best_result = ''
    # NOTE(review): time.clock() was removed in Python 3.8 — consider
    # time.perf_counter() when upgrading.
    start_time = time.clock()
    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            history = model.fit_generator(genfun,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)
            #callbacks=[eval_map])
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]),
                  end='\n')
        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    # list_counts holds per-query boundaries into the batch,
                    # so metrics are computed per query list.
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()
            print('Iter:%d\t%s' % (i_e, '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])),
                  end='\n')
            cur_metric = res[base_metric] / num_valid
            cur_res = '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])
            cur_metric_ls = {}
            for k, v in res.items():
                cur_metric_ls[k] = round(v / num_valid, 4)
            # Keep the weights of the best epoch on the validation split only.
            if cur_metric > best_metric and tag == 'valid':
                best_epoch = i_e
                best_metric = cur_metric
                best_result = cur_res
                best_metric_ls = cur_metric_ls
                model.save_weights(weights_file)
            sys.stdout.flush()
            #if (i_e+1) % save_weights_iters == 0:
            #model.save_weights(weights_file % (i_e+1))
    end_time = time.clock()
    print('the best running result %s the best epoch %d' %
          (best_result, best_epoch))
    print('the running time %s seconds' % (end_time - start_time))
    # NOTE(review): task_id and model_id are not defined in this function —
    # presumably module-level names; confirm against the full file.
    db.insert_result(task_id, model_id, best_metric_ls)
def predict(config):
    """Run prediction with a trained model.

    First pass prints the attention scores exposed by the model's
    'att_layer_2'/'att_layer_3' layers; second pass computes the configured
    ranking metrics and optionally writes ranked results in TREC or TEXTNET
    format per the 'outputs' config.
    """
    ######## Read input config ########
    print(json.dumps(config, indent=2), end='\n')
    input_conf = config['inputs']
    share_input_conf = input_conf['share']
    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # Last vocab slot is the padding row: an all-zero vector.
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')
    # list all input tags and construct tags config;
    # each tag config = shared config overridden by the tag's own keys.
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' %
          (input_predict_conf.keys()), end='\n')
    # collect dataset identification — corpora cached by path, read once.
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')
    # initial data generator
    predict_gen = OrderedDict()
    for tag, conf in input_predict_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
            #data1 = dataset[conf['text1_corpus']],
            #data2 = dataset[conf['text2_corpus']],
            config=conf)
    ######## Read output config ########
    output_conf = config['outputs']
    ######## Load Model ########
    global_conf = config["global"]
    weights_file = str(global_conf['weights_file']) + '.' + str(
        global_conf['test_weights_iters'])
    model = load_model(config)
    model.load_weights(weights_file)
    # Auxiliary model exposing the two attention layers' outputs so their
    # scores can be printed below.
    encoder = Model(inputs=model.input,
                    outputs=[
                        model.get_layer('att_layer_2').output,
                        model.get_layer('att_layer_3').output
                    ])
    # encoder = Model(inputs=model.input, outputs=model.get_layer('att_layer_2').output)
    # Metric specs 'name@k' resolve to callables built with cutoff k.
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    res = dict([[k, 0.] for k in eval_metrics.keys()])
    # First pass: print the attention scores for the query and the sentence.
    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
        num_valid = 0
        res_scores = {}
        for input_data, y_true in genfun:
            y_pred1, y_pred2 = encoder.predict(input_data,
                                               batch_size=len(y_true))
            y_pred1 = _to_list(np.squeeze(y_pred1).tolist())
            y_pred2 = _to_list(np.squeeze(y_pred2).tolist())
            # print("y_pred", len(y_pred), len(y_pred[0]))
            print(input_data)
            print("sent", y_pred1)
            print("query", y_pred2)
            print()
    # Second pass: score with the full model, accumulate metrics per query
    # and collect (score, ground_truth) pairs for the output writers.
    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
        num_valid = 0
        res_scores = {}
        for input_data, y_true in genfun:
            y_pred = model.predict(input_data, batch_size=len(y_true))
            if issubclass(type(generator),
                          inputs.list_generator.ListBasicGenerator):
                # list_counts holds per-query boundaries into the batch.
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts) - 1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx + 1]
                        res[k] += eval_func(y_true=y_true[pre:suf],
                                            y_pred=y_pred[pre:suf])
                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    # p is an (qid, did) pair from the batch's 'ID' field.
                    for p, y, t in zip(input_data['ID'][pre:suf],
                                       y_pred[pre:suf], y_true[pre:suf]):
                        if p[0] not in res_scores:
                            res_scores[p[0]] = {}
                        res_scores[p[0]][p[1]] = (y, t)
                num_valid += len(list_counts) - 1
            else:
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    res_scores[p[0]][p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()
        if tag in output_conf:
            if output_conf[tag]['save_format'] == 'TREC':
                # TREC run format: qid Q0 did rank score run_name (+ gt here).
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        # Rank documents by descending predicted score.
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n' %
                                    (qid, did, inum, score,
                                     config['net_name'], gt))
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            f.write('%s %s %s %s\n' % (gt, qid, did, score))
        print('[Predict] results: ', '\t'.join(
            ['%s=%f' % (k, v / num_valid) for k, v in res.items()]), end='\n')
        sys.stdout.flush()
# NOTE(review): this fragment references `model_config`, `config`, `optimizer`
# and `eval_gen`, none of which are defined in the visible scope — presumably
# module-level names or the rest of an enclosing function not shown here;
# confirm against the full file.
# Build and compile the ANMM model with the configured losses and metrics.
model = ANMM(model_config).build()
loss = []
for lobj in config['losses']:
    if lobj['object_name'] in mz_specialized_losses:
        # Specialized losses take their parameters at construction time.
        loss.append(
            rank_losses.get(lobj['object_name'])(lobj['object_params']))
    else:
        loss.append(rank_losses.get(lobj['object_name']))
# Metric specs 'name@k' resolve to callables built with cutoff k.
eval_metrics = OrderedDict()
for mobj in config['metrics']:
    mobj = mobj.lower()
    if '@' in mobj:
        mt_key, mt_val = mobj.split('@', 1)
        eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
    else:
        eval_metrics[mobj] = metrics.get(mobj)
model.compile(optimizer=optimizer, loss=loss)
print('[Model] Model Compile Done.', end='\n')


def evaluate():
    """Iterate the evaluation generators and print a timestamped header per tag.

    NOTE(review): `i` is initialized to 0 and never incremented in the visible
    code, so the `i == 1` skip-guard can never fire here — confirm intent.
    """
    i = 0
    for tag, generator in eval_gen.items():
        if (i == 1):
            continue
        genfun = generator.get_batch_generator()
        print('[%s]\t[Eval:%s] ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
def predict(config):
    """Evaluate a trained model on all EVAL-phase inputs (Python 2 code).

    Loads embeddings and IDF features, restores weights from
    `global.load_weights_path`, writes per-(query, doc) prediction scores to
    ../output/, and prints the configured ranking metrics; for non-test tags
    it additionally reports the pairwise evaluation loss.
    """
    print(json.dumps(config, indent=2))
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']
    # collect embedding — this variant requires a pretrained embedding file.
    assert 'embed_path' in share_input_conf
    embed_dict, vocab_size, embed_size, word_dict, idf_dict = read_embedding(
        share_input_conf['embed_path'])
    share_input_conf['word_dict'] = word_dict
    share_input_conf['vocab_size'] = vocab_size
    share_input_conf['embed_size'] = embed_size
    # Random init used for words missing from the pretrained file.
    embed = np.float32(np.random.uniform(-9, 9, [vocab_size, embed_size]))
    # DRMM-family models get L2-normalized embeddings.
    embed_normalize = False
    if 'drmm' in config['model']['model_py'].lower():
        embed_normalize = True
    share_input_conf['embed'] = convert_embed_2_numpy(
        'embed',
        embed_dict=embed_dict,
        embed=embed,
        normalize=embed_normalize)
    # Per-word IDF feature column, random-initialized for missing words.
    idf = np.float32(np.random.uniform(4, 9, [vocab_size, 1]))
    share_input_conf['idf_feat'] = convert_embed_2_numpy('idf',
                                                         embed_dict=idf_dict,
                                                         embed=idf,
                                                         normalize=False)
    print '[%s]' % time.strftime(
        "%Y-%m-%d %H:%M:%S",
        time.localtime()), '[Embedding] Embedding Load Done.'
    # list all input tags and construct tags config;
    # TRAIN tags are skipped, only EVAL tags are evaluated here.
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            continue
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print '[%s]' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
    print '[Input] Process Input Tags. %s in EVAL.' % (input_eval_conf.keys())
    # initial data generator
    eval_gen = OrderedDict()
    for tag, conf in input_eval_conf.items():
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)
    ######### Load Model #########
    _model = load_model(config)
    # model = multi_gpu_model(_model, gpus=2)
    model = _model
    # Weights are mandatory for prediction — bail out if not configured.
    if 'load_weights_path' in global_conf:
        model.load_weights(global_conf['load_weights_path'])
    else:
        print 'no load_weights_path'
        exit(0)
    loss = []
    for lobj in config['losses']:
        loss.append(rank_losses.get(lobj))
    # Metric specs 'name@k' resolve to callables built with cutoff k.
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print '[%s]' % time.strftime(
        "%Y-%m-%d %H:%M:%S",
        time.localtime()), '[Model] Model Compile Done.\n'
    print '\n### Model Info ###'
    model.summary()
    print '### Model Info ###\n'
    # Single evaluation pass (range(1) kept for symmetry with training loops).
    for i_e in range(1):
        for tag, generator in eval_gen.items():
            output_dir = config['net_name'].split('_')[0]
            output = open(
                '../output/%s/%s_%s_predict_output_%s.txt' %
                (output_dir, config['net_name'], tag, str(i_e + 1)), 'w')
            # qid -> {'label': [...], 'score': [...]} for metric computation.
            qid_uid_rel_score = {}
            # qid -> {did: score} for pairwise eval-loss computation.
            qid_uid_score = {}
            genfun = generator.get_batch_generator()
            for input_data, y_true, curr_batch in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                y_pred_reshape = np.reshape(y_pred, (len(y_pred), ))
                # output the predict scores
                for (q, d, label), score in zip(curr_batch, y_pred_reshape):
                    output.write('%s\t%s\t%s\t%s\n' %
                                 (str(q), str(d), str(label), str(score)))
                    if q not in qid_uid_score:
                        qid_uid_score[q] = {}
                    qid_uid_score[q][d] = score
                    if q not in qid_uid_rel_score:
                        qid_uid_rel_score[q] = dict(label=list(),
                                                    score=list())
                    qid_uid_rel_score[q]['label'].append(label)
                    qid_uid_rel_score[q]['score'].append(score)
            output.close()
            # calculate the metrices — averaged over queries.
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            for k, eval_func in eval_metrics.items():
                for qid in qid_uid_rel_score:
                    res[k] += eval_func(
                        y_true=qid_uid_rel_score[qid]['label'],
                        y_pred=qid_uid_rel_score[qid]['score'])
                res[k] /= len(qid_uid_rel_score)
            if 'test' in tag:
                print '[%s]' % time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime()),
                print '[Eval] @ epoch: %d,' % (i_e + 1),
                print ', '.join(['%s: %.5f' % (k, res[k]) for k in res])
            else:
                # calculate the eval_loss over all (pos, neg) document pairs.
                all_pairs = generator.get_all_pairs()
                all_pairs_rel_score = {}
                for qid, dp_id, dn_id in all_pairs:
                    all_pairs_rel_score[(qid, dp_id, dn_id)] = {}
                    all_pairs_rel_score[(qid, dp_id, dn_id)]['score'] = [
                        qid_uid_score[qid][dp_id], qid_uid_score[qid][dn_id]
                    ]
                    all_pairs_rel_score[(qid, dp_id,
                                         dn_id)]['rel'] = all_pairs[(qid,
                                                                     dp_id,
                                                                     dn_id)]
                eval_loss = cal_eval_loss(all_pairs_rel_score, tag,
                                          config['losses'])
                print '[%s]' % time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime()),
                print '[Eval] @ epoch: %d,' % (i_e + 1),
                print ', '.join(
                    ['%s: %.5f' % (k, eval_loss[k]) for k in eval_loss]),
                print ', '.join(['%s: %.5f' % (k, res[k]) for k in res])
            print ''
def train(config):
    """Train a ranking model and score checkpoints with BLEU/ROUGE (Python 2).

    Besides the usual MatchZoo training loop, this variant loads a corpus and
    reference files so that, at every `save_weights_iters` checkpoint, the
    ranked outputs can be scored with BLEU/ROUGE-L style generation metrics.
    """
    print(json.dumps(config, indent=2))
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])
    is_save_weights = global_conf['is_save_weights']
    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']
    # prepare the corpus files and reference files for computing BLEU/ROUGE-L metrics
    corpus_file = share_input_conf['corpus_file']
    test_ref_list = read_refs(share_input_conf['test_ref_file'])
    valid_ref_list = read_refs(share_input_conf['valid_ref_file'])
    # Corpus line format assumed here: "<id> <len?> <tokens...>" — the second
    # field is skipped (tok[1]); confirm against the preprocessing step.
    corpus_dict = {}
    with open(corpus_file) as fin:
        for l in fin:
            tok = l.split(' ')
            corpus_dict[tok[0]] = ' '.join(tok[2:])
    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # Last vocab slot is the padding row: an all-zero vector.
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print '[Embedding] Embedding Load Done.'
    # list all input tags and construct tags config;
    # each tag config = shared config overridden by the tag's own keys.
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print '[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' % (
        input_train_conf.keys(), input_eval_conf.keys())
    # collect dataset identification — corpora cached by path, read once.
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print '[Dataset] %s Dataset Load Done.' % len(dataset)
    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    for tag, conf in input_train_conf.items():
        print conf
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)
    for tag, conf in input_eval_conf.items():
        print conf
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)
    output_conf = config['outputs']
    ######### Load Model #########
    model = load_model(config)
    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            # Specialized losses take their parameters at construction time.
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    # Metric specs 'name@k' resolve to callables built with cutoff k.
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print '[Model] Model Compile Done.'
    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print '[%s]\t[Train:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
            history = model.fit_generator(genfun,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)
            #callbacks=[eval_map])
            print 'Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0])
        for tag, generator in eval_gen.items():
            #print('test tag: ', tag)
            genfun = generator.get_batch_generator()
            # print '[%s]\t[Eval:%s]' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
            # res = dict([[k,0.] for k in eval_metrics.keys()])
            res_scores = {
            }  # 2D dict; key qid-did ;value: predict_score, ground_truth
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    list_counts = input_data[
                        'list_counts']  # list_counts store the boundries between documents under different queries
                    y_pred = np.squeeze(y_pred)
                    for lc_idx in range(len(list_counts) - 1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx + 1]
                        # p is an (qid, did) pair from the batch's 'ID' field.
                        for p, y, t in zip(input_data['ID'][pre:suf],
                                           y_pred[pre:suf], y_true[pre:suf]):
                            if p[0] not in res_scores:
                                res_scores[p[0]] = {}
                            res_scores[p[0]][p[1]] = (y, t)
                else:
                    # NOTE(review): this constructs a NameError but never
                    # raises it — the non-list generator path silently does
                    # nothing; likely should be `raise NotImplementedError`.
                    NameError('not supported in this version!')
            generator.reset()
            sys.stdout.flush()
            # save predicted score files for valid/test data
            if (i_e + 1) % save_weights_iters == 0:
                score_list = []
                with open(
                        output_conf['predict']['save_path_during_train'] +
                        '-' + tag + '.' + str(i_e + 1), 'w') as f:
                    for qid, dinfo in res_scores.items():
                        # Rank documents by descending predicted score.
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            score_l = '%s\tQ0\t%s\t%d\t%f\t%s\t%s' % (
                                qid, did, inum, score, config['net_name'], gt)
                            print >> f, score_l
                            score_list.append(score_l)
                # compute BLEU/ROUGE metrics at this check point
                ref_list = test_ref_list if tag == 'test' else valid_ref_list
                bleu_rouge_metrics = compute_bleu_rouge_given_scores_in_train(
                    score_list, corpus_dict, ref_list, tag)
                print '[%s]\t[Eval:%s] Iter:%d\t(bleu1-4 corpus_bleu rougel dist1 dist2 avglen)\t%s' \
                    % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag, i_e+1, bleu_rouge_metrics)
        # NOTE(review): this condition is truthy whenever (i_e+1) is NOT a
        # multiple of save_weights_iters — the opposite of the checkpoint
        # condition above; a missing `== 0` looks likely. Confirm intent.
        if (
                i_e + 1
        ) % save_weights_iters and is_save_weights == "1":  # add an option to control saving weight files or not
            model.save_weights(weights_file % (i_e + 1))
def train(config):
    """Train a KNRM model on pairwise inputs via TimeDistributed wrapping.

    Builds the usual MatchZoo TRAIN/EVAL generators, wraps the loaded scoring
    model in a TimeDistributed layer over (2, 50)-shaped pair inputs, and
    hands training off to `train_per_epoch` with an SGD + Poly schedule.
    """
    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    # NOTE(review): weights_file / display_interval / num_iters /
    # save_weights_iters are read but unused in the visible code — the
    # epoch/batch settings below are hard-coded instead; confirm intent.
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])
    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']
    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # Last vocab slot is the padding row: an all-zero vector.
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')
    # list all input tags and construct tags config;
    # each tag config = shared config overridden by the tag's own keys.
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.'
          % (input_train_conf.keys(), input_eval_conf.keys()), end='\n')
    # collect dataset identification — corpora cached by path, read once.
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')
    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)
    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)
    ######### Load Model #########
    zmodel, kmodel = load_model(config)
    # Wrap the scoring model so it is applied to each of the 2 texts in a
    # (2, 50)-shaped pair input.
    input = Input(name='input', shape=(2, 50))
    timeDistributed = TimeDistributed(layer=zmodel, input_shape=(2, 50))(input)
    z_knrm_model = Model(input=input, output=timeDistributed)
    # Metric specs 'name@k' resolve to callables built with cutoff k.
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    epoch_num = 400
    batch_size = 200  # take a look at the config
    batch_num_per_epoch = 10
    #train_as_whole(z_knrm_model, zmodel, train_gen, eval_gen, eval_metrics)
    z_knrm_model.set_tensorboard("/tmp/matchzoo", "knrm-sgd-1e4")
    # train_per_epoch(z_knrm_model, zmodel, train_gen, eval_gen, eval_metrics, optimMethod=SGD(1e-4))
    # NOTE: `leaningrate_schedule` (sic) matches the external SGD API's own
    # parameter spelling — do not "fix" it here.
    train_per_epoch(z_knrm_model,
                    zmodel,
                    train_gen,
                    eval_gen,
                    eval_metrics,
                    optimMethod=SGD(1e-4,
                                    leaningrate_schedule=Poly(0.5, 50 * 400)))
def train(config):
    """Standard MatchZoo training loop.

    Builds TRAIN/EVAL generators from the config, compiles the model with the
    configured optimizer/losses, alternates display-interval training with
    per-tag evaluation for `num_iters` iterations, and checkpoints weights
    every `save_weights_iters` iterations.
    """
    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    # Resolve the optimizer name to a Keras optimizer and override its LR.
    optimizer=optimizers.get(optimizer)
    K.set_value(optimizer.lr, global_conf['learning_rate'])
    # '%d' slot is filled with the iteration number at checkpoint time.
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])
    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']
    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # Last vocab slot is the padding row: an all-zero vector.
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        # Random init used for words missing from the pretrained file.
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict, embed = embed)
    else:
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')
    # list all input tags and construct tags config;
    # each tag config = shared config overridden by the tag's own keys.
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.'
          % (input_train_conf.keys(), input_eval_conf.keys()), end='\n')
    # collect dataset identification — corpora cached by path, read once.
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')
    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator( config = conf )
    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator( config = conf )
    ######### Load Model #########
    model = load_model(config)
    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            # Specialized losses take their parameters at construction time.
            loss.append(rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    # Metric specs 'name@k' resolve to callables built with cutoff k.
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')
    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s] ' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
            history = model.fit_generator(
                genfun,
                steps_per_epoch = display_interval,
                epochs = 1,
                shuffle=False,
                verbose = 0
            ) #callbacks=[eval_map])
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]), end='\n')
        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s] ' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
            res = dict([[k,0.] for k in eval_metrics.keys()])
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator), inputs.list_generator.ListBasicGenerator):
                    # list_counts holds per-query boundaries into the batch,
                    # so metrics are computed per query list.
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts)-1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx+1]
                            res[k] += eval_func(y_true = y_true[pre:suf], y_pred = y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true = y_true, y_pred = y_pred)
                    num_valid += 1
            generator.reset()
            print('Iter:%d\t%s' % (i_e, '\t'.join(['%s=%f'%(k,v/num_valid) for k, v in res.items()])), end='\n')
            sys.stdout.flush()
        # Periodic checkpoint: iteration number is baked into the filename.
        if (i_e+1) % save_weights_iters == 0:
            model.save_weights(weights_file % (i_e+1))
def predict(config):
    """PREDICT phase: build embeddings and data generators from `config`,
    load trained weights, score every query/document pair, accumulate
    ranking metrics, and optionally dump per-query rankings in TREC or
    TEXTNET format.

    NOTE(review): `res` is initialized once before the tag loop but
    `num_valid` is reset per tag, so with multiple PREDICT tags the
    printed averages mix accumulations across tags — confirm intended.
    """
    ######## Read input config ########
    print(json.dumps(config, indent=2), end='\n')
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # pad token id is the last vocabulary slot; it gets a zero vector
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(np.random.uniform(
            -0.02, 0.02,
            [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict, embed = embed)
    else:
        # no pretrained embedding file: plain random initialization
        embed = np.float32(np.random.uniform(
            -0.2, 0.2,
            [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    # (tag-specific settings override the shared ones)
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.'
          % (input_predict_conf.keys()), end='\n')

    # collect dataset identification: load each corpus file exactly once,
    # keyed by its path
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator (one per PREDICT tag)
    predict_gen = OrderedDict()
    for tag, conf in input_predict_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
            #data1 = dataset[conf['text1_corpus']],
            #data2 = dataset[conf['text2_corpus']],
            config = conf )

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    # weights file name embeds the iteration number of the checkpoint to test
    weights_file = str(global_conf['weights_file']) + '.' + str(global_conf['test_weights_iters'])

    model = load_model(config)
    model.load_weights(weights_file)

    # metric table: "name@k" resolves the metric factory and applies cutoff k
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    res = dict([[k,0.] for k in eval_metrics.keys()])

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
        num_valid = 0
        res_scores = {}  # qid -> {did: (predicted score, ground truth)}
        for input_data, y_true in genfun:
            y_pred = model.predict(input_data, batch_size=len(y_true) )
            if issubclass(type(generator),
                          inputs.list_generator.ListBasicGenerator):
                # list-wise batch: list_counts marks list boundaries inside
                # the flat batch; evaluate each list slice independently
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts)-1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx+1]
                        res[k] += eval_func(y_true = y_true[pre:suf],
                                            y_pred = y_pred[pre:suf])
                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts)-1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx+1]
                    for p, y, t in zip(input_data['ID'][pre:suf],
                                       y_pred[pre:suf], y_true[pre:suf]):
                        if p[0] not in res_scores:
                            res_scores[p[0]] = {}
                        res_scores[p[0]][p[1]] = (y, t)
                num_valid += len(list_counts) - 1
            else:
                # point-wise batch: the whole batch is one evaluation unit
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true = y_true, y_pred = y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    # presumably column 1 holds the positive-class score/label
                    # — TODO confirm against the generator's output layout
                    res_scores[p[0]][p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        if tag in output_conf:
            if output_conf[tag]['save_format'] == 'TREC':
                # standard TREC run format: qid Q0 did rank score run-tag gt
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d:d[1][0], reverse=True)
                        for inum,(did, (score, gt)) in enumerate(dinfo):
                            f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n'%(
                                qid, did, inum, score,
                                config['net_name'], gt))
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d:d[1][0], reverse=True)
                        for inum,(did, (score, gt)) in enumerate(dinfo):
                            f.write('%s %s %s %s\n'%(gt, qid, did, score))
        print('[Predict] results: ', '\t'.join(
            ['%s=%f'%(k,v/num_valid) for k, v in res.items()]), end='\n')
        sys.stdout.flush()
def bro_metrics_json():
    """Serve the current metrics snapshot as a CORS-enabled JSON response."""
    snapshot = metrics.get()
    return cors_jsonify(**snapshot)
def bro_metrics():
    """Render the HTML metrics dashboard from the current metrics snapshot."""
    snapshot = metrics.get()
    return render_template('metrics.html', metrics=snapshot)
def train(config):
    """TRAIN phase: build embeddings and data generators from `config`,
    compile the model, then alternate training and evaluation for
    `num_iters` rounds, checkpointing weights every `save_weights_iters`
    rounds.  (Comments translated from the original Chinese.)
    """
    # json.dumps converts the dict to a string; dumping the raw dict into a
    # JSON file would error, hence the conversion before printing
    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    optimizer = optimizers.get(optimizer)  # always resolved via keras.optimizers
    K.set_value(
        optimizer.lr,
        global_conf['learning_rate'])  # set the variable from a numpy value; lr = learning rate
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # pad token id is the last vocabulary slot; it gets a zero vector
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        # no pretrained embedding file: plain random initialization
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()  # OrderedDict preserves insertion order
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print(
        '[Input] Process Input Tags. %s in TRAIN, %s in EVAL.'
        % (input_train_conf.keys(), input_eval_conf.keys()), end='\n'
    )  # e.g. odict_keys(['train']) in TRAIN, odict_keys(['valid', 'test']) in EVAL.

    # collect dataset identification: load each corpus file exactly once,
    # keyed by its path; PREDICT-only tags are skipped here
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator (one per TRAIN tag and one per EVAL tag)
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)
    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    model = load_model(config)

    # build the loss list; specialized losses take their params at creation
    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    # metric table: "name@k" resolves the metric factory and applies cutoff k
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')
    #add tensorboard check
    # board = keras.callbacks.TensorBoard(log_dir='../data/toy_example/logs', histogram_freq=0)
    # history = LossHistory()
    for i_e in range(num_iters):  # num_iters plays the role of epochs
        for tag, generator in train_gen.items():
            # the batch generator yields batch_size*2 samples per step
            # (half positive, half negative)
            # display_interval = len(pair_list)//(batch_size*2)
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            history = model.fit_generator(
                genfun,
                steps_per_epoch=display_interval,
                epochs=1,
                shuffle=False,
                verbose=0,
                #callbacks = [history, board]
            )  #callbacks=[eval_map])
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]),
                  end='\n')

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    # list-wise batch: list_counts marks list boundaries
                    # inside the flat batch; evaluate each slice separately
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    # point-wise batch: the whole batch is one unit
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()
            print('Iter:%d\t%s' % (i_e, '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])),
                  end='\n')
            sys.stdout.flush()
        if (i_e + 1) % save_weights_iters == 0:
            model.save_weights(weights_file % (i_e + 1))
def predict(config):
    """PREDICT phase: build embeddings and list-wise data generators from
    `config`, load trained weights, score every query/document pair,
    accumulate ranking metrics, and optionally dump per-query rankings in
    TREC or TEXTNET format.

    Fix: the original body used Python 2 print statements (`print x`,
    `print >> f, x`), which are syntax errors under Python 3 even though
    the rest of this file uses the print() function.  All prints were
    converted to print() calls; every format string is unchanged.
    """
    ######## Read input config ########
    print(json.dumps(config, indent=2))
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # NOTE(review): this variant reads the pad id from 'fill_word',
        # while sibling train()/predict() use vocab_size - 1 — confirm.
        _PAD_ = share_input_conf['fill_word']
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        # no pretrained embedding file: plain random initialization
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    # (tag-specific settings override the shared ones)
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' % (
        input_predict_conf.keys()))

    # collect dataset identification: load each corpus file exactly once
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generator (one per PREDICT tag)
    predict_gen = OrderedDict()
    for tag, conf in input_predict_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
            #data1 = dataset[conf['text1_corpus']],
            #data2 = dataset[conf['text2_corpus']],
            config=conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    weights_file = global_conf['weights_file']
    model = load_model(config)
    model.load_weights(weights_file)

    # metric table: "name@k" resolves the metric factory and applies cutoff k
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    res = dict([[k, 0.] for k in eval_metrics.keys()])

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        # end='' replaces the Python 2 trailing-comma print
        print('[Predict] @ %s ' % tag, end='')
        num_valid = 0
        res_scores = {}  # qid -> {did: (predicted score, ground truth)}
        for input_data, y_true in genfun:
            # list_counts marks list boundaries inside the flat batch
            list_counts = input_data['list_counts']
            y_pred = model.predict(input_data, batch_size=len(y_true))
            for k, eval_func in eval_metrics.items():
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    res[k] += eval_func(y_true=y_true[pre:suf],
                                        y_pred=y_pred[pre:suf])
            y_pred = np.squeeze(y_pred)
            for lc_idx in range(len(list_counts) - 1):
                pre = list_counts[lc_idx]
                suf = list_counts[lc_idx + 1]
                for p, y, t in zip(input_data['ID'][pre:suf],
                                   y_pred[pre:suf], y_true[pre:suf]):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    res_scores[p[0]][p[1]] = (y, t)
            num_valid += len(list_counts) - 1
        generator.reset()

        if tag in output_conf:
            if output_conf[tag]['save_format'] == 'TREC':
                # standard TREC run format: qid Q0 did rank score run-tag
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            print('%s\tQ0\t%s\t%d\t%f\t%s' % (
                                qid, did, inum, score, config['net_name']),
                                file=f)
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            print('%s %s %s %s' % (gt, qid, did, score),
                                  file=f)
        print('[Predict] results: ', ' '.join(
            ['%s:%f' % (k, v / num_valid) for k, v in res.items()]))
        sys.stdout.flush()
def __init__(self,
             model_path,
             batch_size,
             epoch_num,
             lr,
             keep_rate,
             seq_len=None,
             net_params=None,
             origin_file=None,
             files_split=None,
             match_model="bert",
             vec_models=None,
             engine=None,
             recall_num=5,
             eval_metrics=None):
    '''Set up the matching/ranking model wrapper and try to restore an
    existing TF checkpoint.  (Docstring translated from the original
    Chinese.)

    :param model_path: directory template where the model is stored
    :param origin_file: raw text-to-category file / question-variant file
    :param files_split: pre-processed train and test split files
    :param match_model: ranking / matching model name
    :param vec_models: feature extractors (list, pluggable)
    :param engine: retrieval engine
    :param eval_metrics: evaluation metrics (list, pluggable)
    '''
    self.r_state = 666  # fixed random state for reproducibility
    assert isinstance(vec_models, list) and len(vec_models) > 0
    if match_model not in nets_dict:
        raise ValueError("valid model must in: {}".format(" ".join(
            [k for k in nets_dict.keys()])))
    self.model_name = match_model
    # model_path is a template with an {m} slot for the model name
    self.model_path = model_path.format(m=match_model)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
    self.max_models_num = 7
    self.net_params = net_params
    self.ignore_std_queries = ["others"]
    # flatten nested featurizer lists into one flat list
    self.featurizers = []
    for m in vec_models:
        if isinstance(m, list):
            self.featurizers += m
        else:
            self.featurizers.append(m)
    self.engine = engine
    if eval_metrics:
        assert isinstance(eval_metrics, list) and len(eval_metrics) > 0
        # metric table: "name@k" resolves the factory and applies cutoff k
        self.eval_metrics = OrderedDict()
        for mobj in eval_metrics:
            mobj = mobj.lower()
            if '@' in mobj:
                mt_key, mt_val = mobj.split('@', 1)
                self.eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
            else:
                self.eval_metrics[mobj] = metrics.get(mobj)
    self.origin_file, self.files_split = origin_file, files_split
    self.sep = "\t"
    self.query_col, self.label_col = "question", "qid"
    self.train_rate = 0.7  # train/test split ratio
    self.key_cols = [self.label_col, self.query_col]
    self.per_docs_num = recall_num
    self.input_cols = ["text_{}".format(i) for i in range(2)]
    self.num_feed_x = len(self.featurizers) * len(self.input_cols)
    self.model_col = "label"
    self.sample_dist = 1.0  # label distribution of the training set
    self.cols = self.input_cols + [self.model_col]
    self.query2qid, self.train_num, self.eva_num = None, None, None
    self.seq_len = seq_len
    self.num_class = 2
    self.keep_rate = keep_rate
    self.batch_size = batch_size
    self.epoch_num = epoch_num
    self.lr = lr
    # one tf/np dtype per feed slot: float for featurizers whose output
    # has rank > 2, int otherwise; final slot is the integer label
    self.tf_dtypes, self.np_dtypes = [], []
    for m in self.featurizers:
        self.tf_dtypes += [
            tf.float32 if len(m.output_shape) > 2 else tf.int32
        ] * len(self.input_cols)
        self.np_dtypes += [
            np.float32 if len(m.output_shape) > 2 else np.int32
        ] * len(self.input_cols)
    self.tf_dtypes.append(tf.int32)
    self.np_dtypes.append(np.int32)
    self.input_tensors = []
    self.test_y, self.keep_prob = None, None
    self.define_tensor()
    self.net_loss, self.one_hot_labels, self.pred_prob = None, None, None
    self.net_init()
    self.session = None
    # restore an existing checkpoint if one is present in model_path
    model_save = tf.train.get_checkpoint_state(self.model_path)
    if model_save and model_save.model_checkpoint_path:
        print("Loading matching model...")
        # tf.reset_default_graph()
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        try:
            saver.restore(self.session, model_save.model_checkpoint_path)
            print("Rank model is ready")
        # NOTE(review): bare except swallows every restore error — consider
        # narrowing to the specific TF exceptions
        except:
            print("Load rank model Failed !!")
    else:
        print("Rank model not exists")
def predict(config):
    """PREDICT phase for the DMN_* multi-task model family: build
    embeddings and generators, load the per-net-name checkpoint(s), score
    each PREDICT tag with its matching sub-model, accumulate metrics, and
    optionally dump results in TREC or TEXTNET format.

    NOTE(review): relies on a module-level `seed` to name the weights and
    output files — confirm it is set before calling.
    NOTE(review): `model`/`model_clf`/`model_web` are bound only for the
    matching `config['net_name']`; a tag that selects an unbound one would
    raise at `model_to_evaluate` assignment — verify config consistency.
    """
    ######## Read input config ########
    print(json.dumps(config, indent=2))
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # pad token id is the last vocabulary slot; it gets a zero vector
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(np.random.uniform(
            -0.02, 0.02,
            [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict, embed = embed)
    else:
        # no pretrained embedding file: plain random initialization
        embed = np.float32(np.random.uniform(
            -0.2, 0.2,
            [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = embed
    print '[Embedding] Embedding Load Done.'

    # list all input tags and construct tags config
    # (tag-specific settings override the shared ones)
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print '[Input] Process Input Tags. %s in PREDICT.' % (input_predict_conf.keys())

    # collect dataset identification: load each data file exactly once
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath] = read_data_2d(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath] = read_data_2d(datapath)
            if 'qa_comat_file' in input_conf[tag]:
                # qa_comat_file for qa_cooccur_matrix in DMN_KD_CQA and DMN_KD_Web
                datapath = input_conf[tag]['qa_comat_file']
                if datapath not in dataset:
                    dataset[datapath] = read_qa_comat(datapath)
    print '[Dataset] %s Dataset Load Done.' % len(dataset)

    # initial data generator (one per PREDICT tag)
    predict_gen = OrderedDict()
    for tag, conf in input_predict_conf.items():
        print conf
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        if 'qa_comat_file' in share_input_conf:
            conf['qa_comat'] = dataset[conf['qa_comat_file']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
            #data1 = dataset[conf['text1_corpus']],
            #data2 = dataset[conf['text2_corpus']],
            config = conf )

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    # checkpoint name embeds the test iteration and the random seed
    weights_file = str(global_conf['weights_file']) + '.' + str(global_conf['test_weights_iters']) + '-' + str(seed)
    # load_model returns a different tuple of sub-models per net_name
    if config['net_name'] == 'DMN_CNN_MTL':
        model, model_clf = load_model(config)
        model.load_weights(weights_file)
    elif config['net_name'] == 'DMN_CNN_INTENTS':
        model_clf = load_model(config)
        model_clf.load_weights(weights_file)
    elif config['net_name'] == 'DMN_CNN_MTL_Web' or config['net_name'] == 'DMN_CNN_MTL_Web_v2':
        model, model_web = load_model(config)
        model.load_weights(weights_file)
        weights_file_web = str(global_conf['weights_file_web']) + '.' + str(global_conf['test_weights_iters']) + '-' + str(seed)
        model_web.load_weights(weights_file_web)
    elif config['net_name'] == 'DMN_CNN_MTL_All':
        model, model_web, model_clf = load_model(config)
        model.load_weights(weights_file)
        weights_file_web = str(global_conf['weights_file_web']) + '.' + str(
            global_conf['test_weights_iters']) + '-' + str(seed)
        model_web.load_weights(weights_file_web)
    else:
        model = load_model(config)
        model.load_weights(weights_file)

    # metric table: "name@k" resolves the metric factory and applies cutoff k
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    res = dict([[k,0.] for k in eval_metrics.keys()])

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print '[%s]\t[Predict] @ %s ' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
        num_valid = 0
        res_scores = {}  # qid -> {did: (score, label)}; or qid -> (pred, label) for predict_clf
        # pick the sub-model matching this prediction tag
        if tag == 'predict':
            model_to_evaluate = model
        elif tag == 'predict_clf':
            model_to_evaluate = model_clf
        elif tag == 'predict_web':
            model_to_evaluate = model_web
        for input_data, y_true in genfun:
            y_pred = model_to_evaluate.predict(input_data, batch_size=len(y_true))
            if tag == 'predict_clf':
                # classifier head: reduce class probabilities to a class id
                y_pred = np.argmax(y_pred, axis=1)
            if issubclass(type(generator), inputs.list_generator.ListBasicGenerator):
                # list-wise batch: list_counts marks list boundaries inside
                # the flat batch; evaluate each list slice independently
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts)-1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx+1]
                        res[k] += eval_func(y_true = y_true[pre:suf], y_pred = y_pred[pre:suf])
                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    for p, y, t in zip(input_data['ID'][pre:suf], y_pred[pre:suf], y_true[pre:suf]):
                        if tag == 'predict_clf':
                            res_scores[p[0]] = (y, t)
                        else:
                            if p[0] not in res_scores:
                                res_scores[p[0]] = {}
                            res_scores[p[0]][p[1]] = (y, t)
                num_valid += len(list_counts) - 1
            else:
                # point-wise batch: the whole batch is one evaluation unit
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true = y_true, y_pred = y_pred)
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    # presumably column 1 holds the positive-class score/label
                    # — TODO confirm against the generator's output layout
                    res_scores[p[0]][p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        if tag in output_conf:
            save_path = output_conf[tag]['save_path'] + '-' + str(seed)
            if output_conf[tag]['save_format'] == 'TREC':
                with open(save_path, 'w') as f:
                    if tag == 'predict_clf':
                        # classifier output: qid, predicted class, gold class
                        for qid, entry in res_scores.items():
                            print >> f, '%s\t%d\t%d'%(qid, entry[0], entry[1])
                    else:
                        # standard TREC run: qid Q0 did rank score run-tag gt
                        for qid, dinfo in res_scores.items():
                            dinfo = sorted(dinfo.items(), key=lambda d:d[1][0], reverse=True)
                            for inum,(did, (score, gt)) in enumerate(dinfo):
                                print >> f, '%s\tQ0\t%s\t%d\t%f\t%s\t%s'%(qid, did, inum, score, config['net_name'], gt)
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(save_path, 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(), key=lambda d:d[1][0], reverse=True)
                        for inum,(did, (score, gt)) in enumerate(dinfo):
                            print >> f, '%s %s %s %s'%(gt, qid, did, score)
        print '[Predict] results: ', '\t'.join(['%s=%f'%(k,v/num_valid) for k, v in res.items()])
        sys.stdout.flush()
def train(config):
    """TRAIN phase: build embeddings and data generators from `config`,
    compile the model, then alternate training and evaluation for
    `num_iters` rounds, checkpointing weights every `save_weights_iters`
    rounds.

    Fix: the original body used Python 2 print statements, which are
    syntax errors under Python 3 even though the rest of this file uses
    the print() function.  All prints were converted to print() calls
    (trailing-comma prints become end=' ' to keep the same output line
    shape); every format string is unchanged.
    """
    print(json.dumps(config, indent=2))
    # read basic config
    global_conf = config["global"]
    # optimizer stays a name string here; Keras resolves it in compile()
    optimizer = global_conf['optimizer']
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # pad token id is the last vocabulary slot; it gets a zero vector
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        # no pretrained embedding file: plain random initialization
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct tags config
    # (tag-specific settings override the shared ones)
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' % (
        input_train_conf.keys(), input_eval_conf.keys()))

    # collect dataset identification: load each corpus file exactly once,
    # keyed by its path; PREDICT-only tags are skipped here
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generator (one per TRAIN tag and one per EVAL tag)
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    for tag, conf in input_train_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)
    for tag, conf in input_eval_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    model = load_model(config)

    # build the loss list; specialized losses take their params at creation
    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    # metric table: "name@k" resolves the metric factory and applies cutoff k
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.')

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            # end=' ' mimics the Python 2 trailing-comma print
            print('[%s]\t[Train:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end=' ')
            history = model.fit_generator(genfun,
                                          steps_per_epoch=display_interval,
                                          epochs=1,
                                          shuffle=False,
                                          verbose=0)  #callbacks=[eval_map])
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]))

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end=' ')
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            num_valid = 0
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    # list-wise batch: list_counts marks list boundaries
                    # inside the flat batch; evaluate each slice separately
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    # point-wise batch: the whole batch is one unit
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()
            print('Iter:%d\t%s' % (i_e, '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])))
            sys.stdout.flush()
        if (i_e + 1) % save_weights_iters == 0:
            model.save_weights(weights_file % (i_e + 1))
def train(config):
    """Train the DMN multi-task models described by ``config``.

    Depending on ``config['net_name']`` this builds and trains one or more of:
    a response-ranking model (``model``), a web-ranking model (``model_web``)
    and an intent classifier (``model_clf``).  Weights are checkpointed every
    ``save_weights_iters`` iterations with the global ``seed`` appended to the
    filename.

    Fixes applied versus the original:
    * All Python-2 ``print`` statements were normalized to the Python-3
      ``print()`` calls already used elsewhere in this file (the mix was a
      syntax error under both interpreters).
    * Loading existing web weights no longer overwrites the
      ``weights_file_web`` ``'%d'`` template (which made the later
      ``weights_file_web % (i_e + 1)`` checkpoint save raise ``TypeError``);
      a separate local name is used for the file being loaded.

    :param config: parsed model/experiment configuration dict.
    :raises Exception: if the module-level ``seed`` has not been set.
    """
    if seed is None:
        raise Exception('Seed should be set')
    print('Using seed: ' + str(seed))

    # read basic config
    global_conf = config["global"]
    learning_rate = global_conf['learning_rate']
    use_existing_weights = global_conf['use_existing_weights'] if 'use_existing_weights' in global_conf else None
    optimizer = Adam(lr=learning_rate)
    # '%d' placeholders are filled with the iteration number at save time.
    weights_file = str(global_conf['weights_file']) + '.%d'
    weights_file_web = str(global_conf['weights_file_web']) + '.%d' if 'weights_file_web' in global_conf else None
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding: either load pretrained vectors (padding row zeroed)
    # or fall back to a random uniform initialization.
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1  # last vocab id is padding
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict, embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.')

    # list all input tags and construct per-tag configs
    # (shared settings first, then tag-specific overrides).
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])

    # collect dataset identification, deduplicated by corpus path
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath] = read_data_2d(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath] = read_data_2d(datapath)
        if 'qa_comat_file' in input_conf[tag]:
            # qa_comat_file for qa_cooccur_matrix in DMN_KD
            datapath = input_conf[tag]['qa_comat_file']
            if datapath not in dataset:
                dataset[datapath] = read_qa_comat(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset))

    # initial data generators for the TRAIN and EVAL phases
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        if 'qa_comat_file' in share_input_conf:
            conf['qa_comat'] = dataset[conf['qa_comat_file']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf)
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        if 'qa_comat_file' in share_input_conf:
            conf['qa_comat'] = dataset[conf['qa_comat_file']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    ######### Load Model #########
    # Which models come back from load_model depends on net_name; resume
    # filenames embed the iteration to load and the seed.
    if config['net_name'] == 'DMN_CNN_MTL':
        model, model_clf = load_model(config)
        if use_existing_weights:
            weights_file_to_load = str(global_conf['weights_file']) + '.' + str(
                global_conf['weights_to_load']) + '-' + str(seed)
            model.load_weights(weights_file_to_load)
        model_clf.compile(optimizer=optimizer, loss=custom_loss)
        print('[Model] MTL models Compile Done.')
    elif config['net_name'] == 'DMN_CNN_INTENTS':
        model_clf = load_model(config)
        model_clf.compile(optimizer=optimizer, loss=custom_loss)
        print('[Model] Intent Only classifier model Compile Done.')
    elif config['net_name'] == 'DMN_CNN_MTL_Web' or config['net_name'] == 'DMN_CNN_MTL_Web_v2':
        model, model_web = load_model(config)
        if use_existing_weights:
            weights_file_to_load = str(global_conf['weights_file']) + '.' + str(
                global_conf['weights_to_load']) + '-' + str(seed)
            model.load_weights(weights_file_to_load)
            # BUGFIX: use a distinct local name so the '%d' template in
            # weights_file_web is kept intact for checkpoint saving below.
            weights_file_web_to_load = str(global_conf['weights_file_web']) + '.' + str(
                global_conf['weights_to_load']) + '-' + str(seed)
            model_web.load_weights(weights_file_web_to_load)
    elif config['net_name'] == 'DMN_CNN_MTL_All':
        model, model_web, model_clf = load_model(config)
        model_clf.compile(optimizer=optimizer, loss=custom_loss)
    else:
        model = load_model(config)
        if use_existing_weights:
            weights_file_to_load = str(global_conf['weights_file']) + '.' + str(
                global_conf['test_weights_iters']) + '-' + str(seed)
            model.load_weights(weights_file_to_load)
        print('[Model] Response Ranking model Compile Done.')

    # build evaluation metrics; 'metric@k' entries are parameterized by k
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)

    # the ranking model(s) use the configured ranking losses; the intent-only
    # classifier was already compiled with custom_loss above
    if config['net_name'] != 'DMN_CNN_INTENTS':
        loss = []
        for lobj in config['losses']:
            if lobj['object_name'] in mz_specialized_losses:
                loss.append(rank_losses.get(lobj['object_name'])(lobj['object_params']))
            else:
                loss.append(rank_losses.get(lobj['object_name']))
        model.compile(optimizer=optimizer, loss=loss)
        print('[Model] Model Compile Done.')
        if config['net_name'] == 'DMN_CNN_MTL_Web' or config['net_name'] == 'DMN_CNN_MTL_Web_v2' \
                or config['net_name'] == 'DMN_CNN_MTL_All':
            model_web.compile(optimizer=optimizer, loss=loss)
            print('[Model Web] Model Compile Done')

    # NOTE: 'predict' is compared as the string 'False' (config values are
    # strings here) — when prediction is disabled, drop the eval splits.
    if share_input_conf['predict'] == 'False':
        if 'test' in eval_gen:
            del eval_gen['test']
        if 'valid' in eval_gen:
            del eval_gen['valid']
        if 'eval_predict_in' in eval_gen:
            del eval_gen['eval_predict_in']

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s]' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end=' ')
            # Dispatch the train tag to its model; assumes tags are limited to
            # train / train_web / train_clf (an unknown tag would leave
            # correct_model unbound — TODO confirm against the configs).
            if tag == "train_clf":
                correct_model = model_clf
            elif tag == 'train_web':
                correct_model = model_web
            elif tag == "train":
                correct_model = model
            history = correct_model.fit_generator(
                genfun,
                steps_per_epoch=display_interval,  # if display_interval = 10, then there are 10 batches in 1 epoch
                epochs=1,
                shuffle=False,
                verbose=0)
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]))

        # evaluate only on checkpoint iterations
        if (i_e + 1) % save_weights_iters == 0:
            for tag, generator in eval_gen.items():
                print('Evaluating tag:' + str(tag))
                genfun = generator.get_batch_generator()
                print('[%s]\t[Eval:%s]' % (time.strftime(
                    '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end=' ')
                res = dict([[k, 0.] for k in eval_metrics.keys()])
                num_valid = 0
                if tag == "valid":
                    correct_model = model
                elif tag == "valid_web":
                    correct_model = model_web
                elif tag == "valid_clf":
                    correct_model = model_clf
                for input_data, y_true in genfun:
                    y_pred = correct_model.predict(input_data, batch_size=len(y_true))
                    if issubclass(type(generator),
                                  inputs.list_generator.ListBasicGenerator):
                        # list-wise generator: score each ranked list slice
                        list_counts = input_data['list_counts']
                        for k, eval_func in eval_metrics.items():
                            for lc_idx in range(len(list_counts) - 1):
                                pre = list_counts[lc_idx]
                                suf = list_counts[lc_idx + 1]
                                res[k] += eval_func(y_true=y_true[pre:suf],
                                                    y_pred=y_pred[pre:suf])
                        num_valid += len(list_counts) - 1
                    else:
                        for k, eval_func in eval_metrics.items():
                            res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                        num_valid += 1
                generator.reset()
                print('Iter:%d\t%s' % (i_e, '\t'.join(
                    ['%s=%f' % (k, v / num_valid) for k, v in res.items()])))
                sys.stdout.flush()
        sys.stdout.flush()

        # checkpoint: '<weights_file>.<iter>-<seed>'
        weights_file_name = (weights_file % (i_e + 1)) + '-' + str(seed)
        if (i_e + 1) % save_weights_iters == 0:
            if config['net_name'] == 'DMN_CNN_MTL_Web' or config['net_name'] == 'DMN_CNN_MTL_Web_v2' \
                    or config['net_name'] == 'DMN_CNN_MTL_All':
                weights_file_name_web = (weights_file_web % (i_e + 1)) + '-' + str(seed)
                model.save_weights(weights_file_name)
                model_web.save_weights(weights_file_name_web)
            elif config['net_name'] != 'DMN_CNN_INTENTS':
                model.save_weights(weights_file_name)
            else:
                model_clf.save_weights(weights_file_name)
def predict(config):
    """Run prediction/evaluation, cross-checking a Zoo model against Keras.

    Builds PREDICT-phase data generators from ``config``, loads a pair of
    models via ``load_model`` (``zmodel``: forward-pass model, ``kmodel``:
    Keras model), compares their outputs batch by batch with
    ``np.allclose``, accumulates the configured ranking metrics, and
    optionally writes per-query scores in TREC or TEXTNET format.
    """
    ######## Read input config ########
    print(json.dumps(config, indent=2), end='\n')
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding: pretrained vectors with a zeroed padding row,
    # or a random uniform fallback when no embed_path is configured
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1  # last vocab id is padding
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        # NOTE(review): this branch uses a narrower init range (+/-0.02) than
        # the +/-0.2 used elsewhere in this file — confirm that is intended.
        embed = np.float32(
            np.random.uniform(-0.02, 0.02, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    # (shared settings first, then tag-specific overrides)
    input_predict_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'PREDICT':
            input_predict_conf[tag] = {}
            input_predict_conf[tag].update(share_input_conf)
            input_predict_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in PREDICT.' %
          (input_predict_conf.keys()),
          end='\n')

    # collect dataset identification, deduplicated by corpus path
    dataset = {}
    for tag in input_conf:
        if tag == 'share' or input_conf[tag]['phase'] == 'PREDICT':
            if 'text1_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text1_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
            if 'text2_corpus' in input_conf[tag]:
                datapath = input_conf[tag]['text2_corpus']
                if datapath not in dataset:
                    dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator for each PREDICT tag
    predict_gen = OrderedDict()
    for tag, conf in input_predict_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        predict_gen[tag] = generator(
            #data1 = dataset[conf['text1_corpus']],
            #data2 = dataset[conf['text2_corpus']],
            config=conf)

    ######## Read output config ########
    output_conf = config['outputs']

    ######## Load Model ########
    global_conf = config["global"]
    # NOTE(review): weights_file is computed but not used below — load_model
    # presumably handles weight loading itself; verify before removing.
    weights_file = str(global_conf['weights_file']) + '.' + str(
        global_conf['test_weights_iters'])

    # zmodel: forward-pass (Zoo) model; kmodel: Keras reference model
    zmodel, kmodel = load_model(config)

    # test y_pred from zoo model and keras model
    # keras2_y_pred = kmodel.predict(input_data, batch_size=batch_size)
    # y_pred = model.forward(input_data)
    # # y_pred = model.predict(input_data, distributed=False)
    # equal = np.allclose(y_pred, keras2_y_pred, rtol=1e-5, atol=1e-5)
    # print(equal)
    # return y_pred

    # build evaluation metrics; 'metric@k' entries are parameterized by k
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    # NOTE: res is created once here, so metric sums accumulate across ALL
    # predict tags (num_valid, below, is reset per tag).
    res = dict([[k, 0.] for k in eval_metrics.keys()])
    # batch_size = 20
    # query_data = np.random.randint(0, 10000, [batch_size, 10])
    # doc_data = np.random.randint(0, 10000, [batch_size, 40])
    # input_data = [query_data, doc_data]
    # keras2_y_pred = keras2_model.predict(input_data, batch_size=batch_size)
    # y_pred = model.predict(input_data, distributed=False)
    # equal = np.allclose(y_pred, keras2_y_pred, rtol=1e-5, atol=1e-5)

    for tag, generator in predict_gen.items():
        genfun = generator.get_batch_generator()
        print('[%s]\t[Predict] @ %s ' % (time.strftime(
            '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
              end='')
        num_valid = 0
        res_scores = {}  # qid -> {doc_id: (score, ground_truth)}
        for input_data, y_true in genfun:
            ky_pred = kmodel.predict(input_data, batch_size=len(y_true))
            # Hard-coded input names/shapes for the Zoo model; assumes
            # query length 10 and doc length 40 — TODO confirm vs. config.
            names = ['query', 'doc']
            shapes = [(None, 10), (None, 40)]
            list_input_data = _standardize_input_data(input_data,
                                                      names,
                                                      shapes,
                                                      check_batch_axis=False)
            # list_input_data = [data[0:2, :] for data in list_input_data]
            # y_pred = zmodel.predict(list_input_data, distributed=False)
            y_pred = zmodel.forward(list_input_data)
            # sanity check: Zoo forward pass vs. Keras prediction
            equal = np.allclose(y_pred, ky_pred, rtol=1e-5, atol=1e-5)
            print(equal)
            if issubclass(type(generator),
                          inputs.list_generator.ListBasicGenerator):
                # list-wise generator: list_counts are slice boundaries of
                # the per-query ranked lists within the batch
                list_counts = input_data['list_counts']
                for k, eval_func in eval_metrics.items():
                    for lc_idx in range(len(list_counts) - 1):
                        pre = list_counts[lc_idx]
                        suf = list_counts[lc_idx + 1]
                        res[k] += eval_func(y_true=y_true[pre:suf],
                                            y_pred=y_pred[pre:suf])
                y_pred = np.squeeze(y_pred)
                for lc_idx in range(len(list_counts) - 1):
                    pre = list_counts[lc_idx]
                    suf = list_counts[lc_idx + 1]
                    for p, y, t in zip(input_data['ID'][pre:suf],
                                       y_pred[pre:suf], y_true[pre:suf]):
                        if p[0] not in res_scores:
                            res_scores[p[0]] = {}
                        res_scores[p[0]][p[1]] = (y, t)
                num_valid += len(list_counts) - 1
            else:
                for k, eval_func in eval_metrics.items():
                    res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                # point-wise path: y/t are per-class arrays; index 1 is taken
                # as the positive-class score — TODO confirm label layout
                for p, y, t in zip(input_data['ID'], y_pred, y_true):
                    if p[0] not in res_scores:
                        res_scores[p[0]] = {}
                    res_scores[p[0]][p[1]] = (y[1], t[1])
                num_valid += 1
        generator.reset()

        # optionally dump ranked scores per query, best score first
        if tag in output_conf:
            if output_conf[tag]['save_format'] == 'TREC':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            f.write('%s\tQ0\t%s\t%d\t%f\t%s\t%s\n' %
                                    (qid, did, inum, score,
                                     config['net_name'], gt))
            elif output_conf[tag]['save_format'] == 'TEXTNET':
                with open(output_conf[tag]['save_path'], 'w') as f:
                    for qid, dinfo in res_scores.items():
                        dinfo = sorted(dinfo.items(),
                                       key=lambda d: d[1][0],
                                       reverse=True)
                        for inum, (did, (score, gt)) in enumerate(dinfo):
                            f.write('%s %s %s %s\n' % (gt, qid, did, score))
        print('[Predict] results: ',
              '\t'.join(['%s=%f' % (k, v / num_valid)
                         for k, v in res.items()]),
              end='\n')
        sys.stdout.flush()
def train(config):
    """Train a ranking model with TensorBoard logging and stat collection.

    Reads the experiment ``config``, builds TRAIN / EVAL / EVAL_LOSS data
    generators, trains with ``fit_generator`` (validating against the
    'test_loss' generator each epoch), records losses and metrics into the
    module-level ``stats_for_plots`` dict, checkpoints weights every
    ``save_weights_iters`` iterations, and finally exports the collected
    loss/metric curves via ``export_loss`` / ``export_metrics``.
    """
    print(json.dumps(config, indent=2), end='\n')
    # read basic config
    global_conf = config["global"]
    optimizer = global_conf['optimizer']
    optimizer = optimizers.get(optimizer)
    K.set_value(optimizer.lr, global_conf['learning_rate'])
    # '%d' placeholder is filled with the iteration number at save time
    weights_file = str(global_conf['weights_file']) + '.%d'
    # logs_dir is shared with the TensorBoard callback / export helpers
    global logs_dir
    logs_dir = str(global_conf['logs'])
    if not os.path.exists(logs_dir):
        os.makedirs(logs_dir)
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']

    # collect embedding: pretrained vectors with a zeroed padding row,
    # or a random uniform fallback when no embed_path is configured
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        _PAD_ = share_input_conf['vocab_size'] - 1  # last vocab id is padding
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ),
                                     dtype=np.float32)
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict,
                                                          embed=embed)
    else:
        embed = np.float32(
            np.random.uniform(-0.2, 0.2, [
                share_input_conf['vocab_size'], share_input_conf['embed_size']
            ]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config; also register each tag
    # in the module-level stats_for_plots dict (train tags get a '_loss' key)
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    input_eval_loss_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
            stats_for_plots[tag + '_loss'] = dict()
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
            stats_for_plots[tag] = dict()
        elif input_conf[tag]['phase'] == 'EVAL_LOSS':
            input_eval_loss_conf[tag] = {}
            input_eval_loss_conf[tag].update(share_input_conf)
            input_eval_loss_conf[tag].update(input_conf[tag])
            stats_for_plots[tag] = dict()
    print(
        '[Input] Process Input Tags. %s in TRAIN, %s in EVAL, %s in EVAL_LOSS.'
        % (input_train_conf.keys(), input_eval_conf.keys(),
           input_eval_loss_conf.keys()),
        end='\n')

    # collect dataset identification, deduplicated by corpus path
    dataset = {}
    for tag in input_conf:
        if tag != 'share' and input_conf[tag]['phase'] == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generators for each phase
    train_gen = OrderedDict()
    eval_gen = OrderedDict()
    eval_loss_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator(config=conf)

    for tag, conf in input_eval_loss_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_loss_gen[tag] = generator(config=conf)

    ######### Load Model #########
    model = load_model(config)

    # build loss list; specialized losses take their params at construction.
    # Also pre-register an empty series per loss under every '*loss*' stats key.
    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            loss.append(
                rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
        for k, v in stats_for_plots.items():
            if 'loss' in k:
                stats_for_plots[k][lobj['object_name']] = []

    # build evaluation metrics; 'metric@k' entries are parameterized by k.
    # Pre-register an empty series per metric under every non-loss stats key.
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
        for k, v in stats_for_plots.items():
            if 'loss' not in k:
                stats_for_plots[k][mobj] = []

    model.compile(optimizer=optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            # validation loss is always computed on the 'test_loss' generator;
            # assumes an EVAL_LOSS tag named 'test_loss' exists in the config
            evalfun = eval_loss_gen['test_loss'].get_batch_generator()
            print('*' * 100)
            history = model.fit_generator(
                genfun,
                steps_per_epoch=display_interval,
                epochs=1,
                shuffle=False,
                verbose=0,
                validation_data=evalfun,
                validation_steps=display_interval,
                callbacks=[
                    TrainValTensorBoard(log_dir=os.path.join(
                        logs_dir, 'tensorboard'),
                                        global_step=display_interval * i_e,
                                        write_graph=False)
                ])
            #callbacks=[eval_map])
            # record this epoch's train/val loss into every '*loss*' series.
            # NOTE(review): lobj here is the *last* element left over from the
            # losses loop above — correct only for single-loss configs; verify.
            for k, v in stats_for_plots.items():
                if 'loss' in k:
                    print('[%s]\t[Train:%s] ' % (time.strftime(
                        '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), k),
                          end='')
                    if any(srchstr in k
                           for srchstr in ('test', 'val', 'valid')):
                        _l = history.history['val_loss'][0]
                        stats_for_plots[k][lobj['object_name']].append(_l)
                    else:
                        _l = history.history['loss'][0]
                        stats_for_plots[k][lobj['object_name']].append(_l)
                    print('Iter:%d\t Loss =%.6f' % (i_e, _l), end='\n')
            print('-' * 50)

        for tag, generator in eval_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Eval:%s] ' % (time.strftime(
                '%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag),
                  end='')
            res = dict([[k, 0.] for k in eval_metrics.keys()])
            num_valid = 0
            #history_eval = model.evaluate_generator(genfun, steps=1)
            #print("history_eval: {}".format(history_eval))
            for input_data, y_true in genfun:
                y_pred = model.predict(input_data, batch_size=len(y_true))
                if issubclass(type(generator),
                              inputs.list_generator.ListBasicGenerator):
                    # list-wise generator: list_counts are slice boundaries
                    # of the per-query ranked lists within the batch
                    list_counts = input_data['list_counts']
                    for k, eval_func in eval_metrics.items():
                        for lc_idx in range(len(list_counts) - 1):
                            pre = list_counts[lc_idx]
                            suf = list_counts[lc_idx + 1]
                            res[k] += eval_func(y_true=y_true[pre:suf],
                                                y_pred=y_pred[pre:suf])
                    num_valid += len(list_counts) - 1
                else:
                    for k, eval_func in eval_metrics.items():
                        res[k] += eval_func(y_true=y_true, y_pred=y_pred)
                    num_valid += 1
            generator.reset()
            # append averaged metrics for this eval tag to the plot series
            for k, v in res.items():
                stats_for_plots[tag][k].append(v / num_valid)
            print('Iter:%d\t%s' % (i_e, '\t'.join(
                ['%s=%f' % (k, v / num_valid) for k, v in res.items()])),
                  end='\n')
            sys.stdout.flush()
        if (i_e + 1) % save_weights_iters == 0:
            model.save_weights(weights_file % (i_e + 1))
    # dump collected loss/metric curves once training completes
    export_loss(False, 1)
    export_metrics(False, 1)