def save(self, filename):
    if hasattr(self, 'save_parm'):
        params = self.params + self.save_parm
    else:
        params = self.params

    # Build a readable summary of parameter names and shapes.
    ps = 'save: <\n'
    for p in params:
        ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
    ps += '> to ... {}'.format(filename)
    # logger.info(ps)

    # The hdf5 module seems to behave abnormally, so fall back to pickling:
    # dd.io.save(filename, self.get_weights())
    serialize_to_file(self.get_weights(), filename)
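# serialize_to_file / deserialize_from_file are used throughout these scripts
# but are not defined in this excerpt. A minimal sketch, assuming they are
# thin cPickle wrappers (the real helpers may differ, e.g. in protocol or
# compression):
import cPickle as pickle

def serialize_to_file(obj, path):
    # Dump any picklable object (weights, datasets, vocabularies) to disk.
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def deserialize_from_file(path):
    # Load an object previously written by serialize_to_file.
    with open(path, 'rb') as f:
        return pickle.load(f)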
def load_additional_testing_data(testing_names, idx2word, word2idx, config,
                                 postagging=True, process_type=1):
    test_sets = {}

    # Rule out the ones that appear in the testing data.
    for dataset_name in testing_names:
        test_set_path = (config['path'] + '/dataset/keyphrase/'
                         + config['data_process_name'] + dataset_name + '.testing.pkl')

        if os.path.exists(test_set_path):
            test_set = deserialize_from_file(test_set_path)
            print('Loading testing dataset %s from %s' % (dataset_name, test_set_path))
        else:
            print('Creating testing dataset %s: %s' % (dataset_name, test_set_path))
            dataloader = testing_data_loader(dataset_name, kwargs=dict(basedir=config['path']))
            records = dataloader.get_docs()
            records, pairs, _ = utils.load_pairs(records, process_type=process_type, do_filter=False)
            test_set = utils.build_data(pairs, idx2word, word2idx)
            test_set['record'] = records

            if postagging:
                tagged_sources = get_postag_with_record(records, pairs)
                test_set['tagged_source'] = [[t[1] for t in s] for s in tagged_sources]

                if getattr(dataloader, 'text_postag_dir', None) is not None:
                    print('Exporting postagged data to %s' % dataloader.text_postag_dir)
                    if not os.path.exists(dataloader.text_postag_dir):
                        os.makedirs(dataloader.text_postag_dir)
                    for r_, p_, s_ in zip(records, pairs, tagged_sources):
                        with open(dataloader.text_postag_dir + '/' + r_['name'] + '.txt', 'w') as f:
                            f.write(' '.join([w + '_' + t for w, t in s_]))
                else:
                    print('text_postag_dir not found; skipping export of postagged data')

            serialize_to_file(test_set, test_set_path)

        test_sets[dataset_name] = test_set

    return test_sets
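# Hypothetical usage sketch; the 'testing_datasets' config key and the
# dataset names in the comment are illustrative, not taken from the real config.
test_sets = load_additional_testing_data(
    config['testing_datasets'],  # e.g. ['inspec', 'semeval']
    idx2word, word2idx, config,
    postagging=True, process_type=1)
for name, ts in test_sets.items():
    print('%s: %d documents' % (name, len(ts['record'])))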
                T += [ftr(v) for v in X]
            elif w == 'Y':  # 'is' tests identity, not equality; use '==' for strings
                T += [ftr(v) for v in Y]
            else:
                T += [w]

        A = [word2idx[w] for w in S]
        B = [word2idx[w] for w in T]
        # Copy indices: 0 if the target word is absent from the source, else
        # its source position shifted by Lmax so copy ids never collide with
        # vocabulary ids.
        C = [0 if w not in S else S.index(w) + Lmax for w in T]

        instance['x'] += [S]
        instance['y'] += [T]
        instance['source'] += [A]
        instance['target'] += [B]
        instance['target_c'] += [C]
        instance['rule_id'] += [k]
        instance['rule'] += [' '.join(source) + ' -> ' + ' '.join(target)]

    return instance

train_set = build_instance()
print 'build ok.'
test_set = build_instance()
print 'build ok.'

serialize_to_file([train_set, test_set, idx2word, word2idx],
                  '/home/thoma/Work/Dial-DRL/dataset/synthetic_data_c.pkl')
# serialize_to_file([train_set, test_set], '/home/thoma/Work/Dial-DRL/dataset/synthetic_data.pkl')
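# A toy, self-contained illustration of the copy-index encoding used for
# 'target_c' above (the words and Lmax_demo are made up; Lmax_demo stands in
# for the real vocabulary size):
Lmax_demo = 5
S_demo = ['new', 'york', 'is', 'big']
T_demo = ['new', 'york', 'city']
C_demo = [0 if w not in S_demo else S_demo.index(w) + Lmax_demo for w in T_demo]
print C_demo  # [5, 6, 0]: copyable words map to source position + Lmax, others to 0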
        # 5. Save model
        if batch_id % 500 == 0 and batch_id > 1:
            # Save the weights every K rounds.
            agent.save(
                config['path_experiment'] +
                '/experiments.{0}.id={1}.epoch={2}.batch={3}.pkl'.format(
                    config['task_name'], config['timemark'], epoch, batch_id))

            # Save the training progress in case of interruption.
            optimizer_config = agent.optimizer.get_config()
            serialize_to_file(
                [name_ordering, batch_id, loss, valid_param, optimizer_config],
                config['path_experiment'] +
                '/save_training_status.id={0}.epoch={1}.batch={2}.pkl'.format(
                    config['timemark'], epoch, batch_id))
            print(optimizer_config)
            # agent.save_weight_json(config['path_experiment'] + '/weight.print.id={0}.epoch={1}.batch={2}.json'.format(config['timemark'], epoch, batch_id))

        # 6. Stop if patience is exceeded
        if valid_param['valids_not_improved'] >= valid_param['patience']:
            print("Not improved for %s epochs. Stopping..."
                  % valid_param['valids_not_improved'])
            valid_param['early_stop'] = True
            break

    '''
    test accuracy and f-score at the end of each epoch
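# A minimal resume sketch: unpack the training status saved in step 5 above.
# The list order mirrors that serialize_to_file call; restoring the agent and
# optimizer themselves is omitted because their APIs are not shown here.
status_path = (config['path_experiment'] +
               '/save_training_status.id={0}.epoch={1}.batch={2}.pkl'.format(
                   config['timemark'], epoch, batch_id))
name_ordering, batch_id, loss, valid_param, optimizer_config = \
    deserialize_from_file(status_path)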
# use character-based model [on]
# use word-based model [off]
def build_data(data):
    instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
    for pair in data:
        source, target = pair
        A = [word2idx[w] for w in source]
        B = [word2idx[w] for w in target]
        # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
        C = [0 if w not in source else source.index(w) + Lmax for w in target]

        instance['text'] += [source]
        instance['summary'] += [target]
        instance['source'] += [A]
        instance['target'] += [B]
        # instance['cc_matrix'] += [C]
        instance['target_c'] += [C]

    # print instance['target'][5000]
    # print instance['target_c'][5000]
    return instance

train_set = build_data(training)
test_set = build_data(testing)
serialize_to_file([train_set, test_set, idx2word, word2idx],
                  './dataset/geo880/data-word-full.pkl')
            mir_vec = mir_doc['backward_encoding']
        if encoding_name == 'forward-backward':
            iir_vec = np.concatenate([iir_doc['forward_encoding'],
                                      iir_doc['backward_encoding']])
            mir_vec = np.concatenate([mir_doc['forward_encoding'],
                                      mir_doc['backward_encoding']])

        sim = cosine_similarity(iir_vec, mir_vec)
        similarity_matrix[iir_doc['name']].append((mir_doc['name'], sim))
        # print('%s vs %s = %f' % (iir_doc['name'], mir_doc['name'], sim))

serialize_to_file(similarity_matrix, similarity_matrix_file)

for k in [1, 3, 5]:
    ndcg_k = 0
    for testing_data_i in testing_data_list:
        '''
        each testing_data_i consists of a bunch of mappings
        '''
        # print(len(testing_data_i))
        ndcg_ = evaluate_ndcg_at_k(testing_data_i, k)
        ndcg_k += ndcg_
    print('NDCG@%d = %f/%d = %f'
          % (k, ndcg_k, len(testing_data_list),
             float(ndcg_k) / len(testing_data_list)))
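# If cosine_similarity above is sklearn's pairwise version, it expects 2-D
# arrays and returns a matrix; for two 1-D encodings a plain numpy cosine is
# enough. A minimal sketch (assumes non-zero vectors):
import numpy as np

def cosine_sim_1d(a, b):
    # Cosine similarity between two 1-D vectors.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))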
    print idx2word[i].encode('utf-8')

# use character-based model [on]
# use word-based model [off]
def build_data(data):
    instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
    for pair in data:
        source, target = pair
        A = [word2idx[w] for w in source]
        B = [word2idx[w] for w in target]
        # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
        C = [0 if w not in source else source.index(w) + Lmax for w in target]

        instance['text'] += [source]
        instance['summary'] += [target]
        instance['source'] += [A]
        instance['target'] += [B]
        # instance['cc_matrix'] += [C]
        instance['target_c'] += [C]

    print instance['target'][5000]
    print instance['target_c'][5000]
    return instance

train_set = build_data(pairs[10000:])
test_set = build_data(pairs[:10000])
serialize_to_file([train_set, test_set, idx2word, word2idx],
                  './dataset/weibo_data-word-cooc.pkl')
def obtain_dataset():
    rfile = open('/home/thoma/Work/Dial-DRL/dataset/BST_1M.txt', 'r')
    line = rfile.readline()

    word2idx = {'<eol>': 0, '<unk>': 1}
    pairs = []
    at = 2
    lines = 0
    while line:
        lines += 1
        line = line.strip()
        # Each line holds a 'source -> target' pair of whitespace-separated tokens.
        source, target = line.split('->')
        source = source.split()
        target = target.split()
        for w in source:
            if w not in word2idx:
                word2idx[w] = at
                at += 1
        for w in target:
            if w not in word2idx:
                word2idx[w] = at
                at += 1
        pairs.append((source, target))
        if lines % 20000 == 0:
            print lines
        line = rfile.readline()

    idx2word = dict()
    for v, k in word2idx.items():
        idx2word[k] = v
    Lmax = len(idx2word)
    print 'read dataset ok.'
    print Lmax
    for i in xrange(Lmax):
        print idx2word[i]

    def build_data(data):
        instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
        for pair in data:
            source, target = pair
            A = [word2idx[w] for w in source]
            B = [word2idx[w] for w in target]
            # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
            C = [0 if w not in source else source.index(w) + Lmax for w in target]
            instance['text'] += [source]
            instance['summary'] += [target]
            instance['source'] += [A]
            instance['target'] += [B]
            # instance['cc_matrix'] += [C]
            instance['target_c'] += [C]
        print instance['target'][5000]
        print instance['target_c'][5000]
        return instance

    train_set = build_data(pairs[100000:])
    test_set = build_data(pairs[:100000])
    serialize_to_file([train_set, test_set, idx2word, word2idx],
                      '/home/thoma/Work/Dial-DRL/dataset/BST_1M.data.pkl')
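# A toy check of the line format obtain_dataset expects ('source -> target',
# whitespace-tokenized; the sample content is made up):
demo_line = '5 3 8 -> 3 5 8'
src, tgt = demo_line.split('->')
print src.split(), tgt.split()  # ['5', '3', '8'] ['3', '5', '8']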
    else:
        return word

# Prepare the vocabulary.
data_clean = [[replace(w) for w in l] for l in data_rep]

idx2word2 = dict(enumerate(set([w for l in data_clean for w in l]), 1))
idx2word2[0] = '<eol>'
word2idx2 = {v: k for k, v in idx2word2.items()}
Lmax = len(idx2word2)

for k in xrange(len(idx2word2)):
    print k, '\t', idx2word2[k]
print 'Max: {}'.format(Lmax)

serialize_to_file([idx2word2, word2idx2, idx2word, word2idx],
                  './dataset/bAbI/voc-b.pkl')

# Get the dataset ready.
source = [[word2idx2[w] for w in l] for l in data_clean]
target = [[word2idx2[w] if w not in ['<person>', '<color>', '<shape>'] else it + Lmax
           for it, w in enumerate(l)]
          for l in data_clean]

def print_str(data):
    for d in data:
        print ' '.join(str(w) for w in d)

print_str(data[10000:10005])
    print idx2word[i].encode('utf-8')

# use character-based model [on]
# use word-based model [off]
def build_data(data):
    instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
    for pair in data:
        source, target = pair
        A = [word2idx[w] for w in source]
        B = [word2idx[w] for w in target]
        # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
        C = [0 if w not in source else source.index(w) + Lmax for w in target]

        instance['text'] += [source]
        instance['summary'] += [target]
        instance['source'] += [A]
        instance['target'] += [B]
        # instance['cc_matrix'] += [C]
        instance['target_c'] += [C]

    print instance['target'][5000]
    print instance['target_c'][5000]
    return instance

train_set = build_data(pairs)
test_set = build_data(tests)
serialize_to_file([train_set, test_set, idx2word, word2idx],
                  './dataset/lcsts_data-char-full.pkl')
print pairs[0]

def build_data(data):
    instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
    print len(data)
    for pair in data:
        source, target = pair
        A = [word2idx[w] for w in source]
        B = [word2idx[w] for w in target]
        # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
        C = [0 if w not in source else source.index(w) + Lmax for w in target]

        instance['text'] += [source]
        instance['summary'] += [target]
        instance['source'] += [A]
        instance['target'] += [B]
        # instance['cc_matrix'] += [C]
        instance['target_c'] += [C]

    print instance['source'][4000]
    print instance['target'][4000]
    print instance['target_c'][4000]
    return instance

train_set = build_data(pairs[10000:])
test_set = build_data(pairs[:10000])
serialize_to_file([train_set, test_set, idx2word, word2idx],
                  './dataset/movie_dialogue_data.pkl')
print('Generating idx2word2 and word2idx2')
# Replace names/colors/shapes with <tag> placeholders.
data_clean = [[replace(w) for w in l] for l in data_rep]

idx2word2 = dict(enumerate(set([w for l in data_clean for w in l]), 1))
idx2word2[0] = '<eol>'
word2idx2 = {v: k for k, v in idx2word2.items()}
Lmax = len(idx2word2)
for k in xrange(len(idx2word2)):
    print k, '\t', idx2word2[k]
print 'Max: {}'.format(Lmax)

# idx2word (3rd) and word2idx (4th) are built from the source, size=132.
# idx2word2 (1st) and word2idx2 (2nd) are built from the target, with
# names/colors/shapes replaced by <tag>, size=98.
print('Exporting dicts to file')
serialize_to_file([idx2word2, word2idx2, idx2word, word2idx], config['voc'])

print('Generating source, target, origin')
# Get the dataset ready.
# The source sequence is the sentence with all real people/colors/shapes
# converted into the tags '<person>', '<color>', '<shape>'.
source = [[word2idx2[w] for w in l] for l in data_clean]
# In the target, keep a word if it is not a person/color/shape; otherwise
# replace it with it + Lmax (an id outside the dictionary, marking a copy).
target = [[word2idx2[w] if w not in ['<person>', '<color>', '<shape>'] else it + Lmax
           for it, w in enumerate(l)]
          for l in data_clean]

def print_str(data):
    for d in data:
        print ' '.join(str(w) for w in d)
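# A toy, self-contained illustration of the source/target encoding above
# (the word ids are made up; Lmax_demo stands in for len(idx2word2)):
Lmax_demo = 98
word2idx2_demo = {'<person>': 7, 'ran': 12, 'home': 13}
sent = ['<person>', 'ran', 'home']
src = [word2idx2_demo[w] for w in sent]                  # [7, 12, 13]
tgt = [word2idx2_demo[w] if w not in ['<person>', '<color>', '<shape>']
       else it + Lmax_demo
       for it, w in enumerate(sent)]                     # [98, 12, 13]
print src, tgt  # tag positions become position + Lmax, i.e. copy pointers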