def load(self, filename):
    logger.info('load the weights.')
    # the hdf5 module seems to behave abnormally here, so fall back to pickle
    # weights = dd.io.load(filename)
    weights = deserialize_from_file(filename)
    print(len(weights))
    self.set_weights(weights)
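# The matching save path is not shown in this excerpt. Below is a minimal sketch only,
# assuming the model exposes a Keras-style get_weights() and that serialize_to_file
# (imported elsewhere in these files) is the counterpart of deserialize_from_file.
def save(self, filename):
    # Hypothetical companion to load(): dump the weight list with the same
    # pickle-based serializer instead of the problematic hdf5 path.
    logger.info('save the weights.')
    weights = self.get_weights()
    serialize_to_file(weights, filename)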
def check_postag(config):
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])

    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    for dataset_name in config['testing_datasets']:
        # override the original test_set
        # test_set = load_testing_data(dataset_name, kwargs=dict(basedir=config['path']))(idx2word, word2idx, config['preprocess_type'])
        test_sets = load_additional_testing_data(config['testing_datasets'], idx2word, word2idx, config)
        test_set = test_sets[dataset_name]

        # print(dataset_name)
        # print('Avg length=%d, Max length=%d' % (np.average([len(s) for s in test_set['source']]), np.max([len(s) for s in test_set['source']])))

        test_data_plain = zip(*(test_set['source'], test_set['target']))
        test_size = len(test_data_plain)

        # Alternatively to setting the CLASSPATH, add the jar and model via their paths:
        jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
        # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
        model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'
        pos_tagger = StanfordPOSTagger(model, jar)

        for idx in xrange(len(test_data_plain)):
            test_s_o, test_t_o = test_data_plain[idx]
            source = keyphrase_utils.cut_zero(test_s_o, idx2word)
            print(source)

            # add the other jars from the Stanford directory to the tagger's classpath
            stanford_dir = jar.rpartition('/')[0]
            stanford_jars = find_jars_within_path(stanford_dir)
            pos_tagger._stanford_jar = ':'.join(stanford_jars)

            text = pos_tagger.tag(source)
            print(text)
def build_evaluation(train_set, segment):
    _, _, idx2word, word2idx = deserialize_from_file(train_set)
    pairs = []
    f = open('./dataset/LCSTS/PART_III/PART_III.txt', 'r')
    line = f.readline().strip()
    lines = 0
    while line:
        if '<human_label>' in line:
            score = int(line[13])
            if score >= 3:
                f.readline()
                summary = f.readline().strip().decode('utf-8')
                if segment:
                    summary = [w for w in jb.cut(summary)]
                target = []
                for w in summary:
                    if w not in word2idx:
                        word2idx[w] = len(word2idx)
                        idx2word[len(idx2word)] = w
                    target += [word2idx[w]]

                f.readline()
                f.readline()
                text = f.readline().strip().decode('utf-8')
                if segment:
                    text = [w for w in jb.cut(text)]
                source = []
                for w in text:
                    if w not in word2idx:
                        word2idx[w] = len(word2idx)
                        idx2word[len(idx2word)] = w
                    source += [word2idx[w]]

                pair = (text, summary, score, source, target)
                pairs.append(pair)
                lines += 1
                if lines % 1000 == 0:
                    print lines
        line = f.readline().strip()

    print 'lines={}'.format(len(pairs))
    return pairs, word2idx, idx2word
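# Usage sketch for build_evaluation. The pickle paths below are placeholders,
# not the paths used by the surrounding scripts.
pairs, word2idx, idx2word = build_evaluation('./dataset/lcsts_train_voc.pkl', segment=True)
serialize_to_file((pairs, word2idx, idx2word), './dataset/lcsts_part3_pairs.pkl')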
def load_additional_testing_data(testing_names, idx2word, word2idx, config, postagging=True, process_type=1):
    test_sets = {}

    # rule out the ones that appear in the testing data
    for dataset_name in testing_names:
        pkl_path = config['path'] + '/dataset/keyphrase/' + config['data_process_name'] + dataset_name + '.testing.pkl'

        if os.path.exists(pkl_path):
            test_set = deserialize_from_file(pkl_path)
            print('Loading testing dataset %s from %s' % (dataset_name, pkl_path))
        else:
            print('Creating testing dataset %s: %s' % (dataset_name, pkl_path))
            dataloader = testing_data_loader(dataset_name, kwargs=dict(basedir=config['path']))
            records = dataloader.get_docs()
            records, pairs, _ = utils.load_pairs(records, process_type=process_type, do_filter=False)
            test_set = utils.build_data(pairs, idx2word, word2idx)
            test_set['record'] = records

            if postagging:
                tagged_sources = get_postag_with_record(records, pairs)
                test_set['tagged_source'] = [[t[1] for t in s] for s in tagged_sources]

                if hasattr(dataloader, 'text_postag_dir') and dataloader.__getattribute__('text_postag_dir') != None:
                    print('Exporting postagged data to %s' % (dataloader.text_postag_dir))
                    if not os.path.exists(dataloader.text_postag_dir):
                        os.makedirs(dataloader.text_postag_dir)
                    for r_, p_, s_ in zip(records, pairs, tagged_sources):
                        with open(dataloader.text_postag_dir + '/' + r_['name'] + '.txt', 'w') as f:
                            output_str = ' '.join([w + '_' + t for w, t in s_])
                            f.write(output_str)
                else:
                    print('text_postag_dir not found, no export of postagged data')

            serialize_to_file(test_set, pkl_path)

        test_sets[dataset_name] = test_set

    return test_sets
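# Minimal usage sketch, assuming a config produced by setup_keyphrase_all() and the packed
# vocabulary stored at config['dataset'], both of which appear in the other excerpts here.
config = setup_keyphrase_all()
_, _, _, idx2word, word2idx = deserialize_from_file(config['dataset'])
test_sets = load_additional_testing_data(['inspec'], idx2word, word2idx, config,
                                         postagging=False)
for name, test_set in test_sets.items():
    print('%s: %d documents' % (name, len(test_set['source'])))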
def export_krapivin_maui():
    # prepare logging.
    config = keyphrase.config.setup_keyphrase_all()  # load settings.

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(config['dataset'])
    test_sets = load_additional_testing_data(config['testing_datasets'], idx2word, word2idx, config)

    # keep the first 400 documents of the Krapivin dataset for testing
    dataset = test_sets['krapivin']

    train_dir = '/Users/memray/Project/seq2seq-keyphrase/dataset/keyphrase/baseline-data/maui/krapivin/train/'
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    train_texts = dataset['source_str'][401:]
    train_targets = dataset['target_str'][401:]
    for i, (train_text, train_target) in enumerate(zip(train_texts, train_targets)):
        print('train ' + str(i))
        with open(train_dir + str(i) + '.txt', 'w') as f:
            f.write(' '.join(train_text))
        with open(train_dir + str(i) + '.key', 'w') as f:
            f.write('\n'.join([' '.join(t) + '\t1' for t in train_target]))

    test_dir = '/Users/memray/Project/seq2seq-keyphrase/dataset/keyphrase/baseline-data/maui/krapivin/test/'
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
    test_texts = dataset['source_str'][:400]
    test_targets = dataset['target_str'][:400]
    for i, (test_text, test_target) in enumerate(zip(test_texts, test_targets)):
        print('test ' + str(i))
        with open(test_dir + str(i) + '.txt', 'w') as f:
            f.write(' '.join(test_text))
        with open(test_dir + str(i) + '.key', 'w') as f:
            f.write('\n'.join([' '.join(t) + '\t1' for t in test_target]))
def export_UTD():
    # prepare logging.
    config = keyphrase.config.setup_keyphrase_all()  # load settings.

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(config['dataset'])
    test_sets = load_additional_testing_data(config['testing_datasets'], idx2word, word2idx, config)

    for dataset_name, dataset in test_sets.items():
        print('Exporting %s' % str(dataset_name))

        # keep only the first 400 documents of the Krapivin dataset
        if dataset_name == 'krapivin':
            dataset['tagged_source'] = dataset['tagged_source'][:400]

        for i, d in enumerate(zip(dataset['tagged_source'], dataset['target_str'])):
            source_postag, target = d
            print('[%d/%d]' % (i, len(dataset['tagged_source'])))

            output_text = ' '.join([sp[0] + '_' + sp[1] for sp in source_postag])
            output_dir = config['baseline_data_path'] + dataset_name + '/text/'
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            with open(output_dir + '/' + str(i) + '.txt', 'w') as f:
                f.write(output_text)

            output_text = '\n'.join([' '.join(t) for t in target])
            tag_output_dir = config['baseline_data_path'] + dataset_name + '/keyphrase/'
            if not os.path.exists(tag_output_dir):
                os.makedirs(tag_output_dir)
            with open(tag_output_dir + '/' + str(i) + '.txt', 'w') as f:
                f.write(output_text)
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup()  # load settings.
for w in config:
    print '{0}={1}'.format(w, config[w])

logger = init_logging(config['path_log'] + '/experiments.CopyWeibo.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')

train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])

if config['voc_size'] == -1:  # do not use <unk>
    config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
    config['dec_voc_size'] = config['enc_voc_size']
else:
    config['enc_voc_size'] = config['voc_size']
    config['dec_voc_size'] = config['enc_voc_size']

samples = len(train_set['source'])
logger.info('build dataset done. ' +
            'dataset size: {} ||'.format(samples) +
            'vocabulary size = {0}/ batch size = {1}'.format(config['dec_voc_size'], config['batch_size']))


def build_data(data):
def check_data():
    config = setup_keyphrase_all()
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])

    for dataset_name in config['testing_datasets']:
        print('*' * 50)
        print(dataset_name)
        number_groundtruth = 0
        number_present_groundtruth = 0

        loader = testing_data_loader(dataset_name, kwargs=dict(basedir=config['path']))
        if dataset_name == 'nus':
            docs = loader.get_docs(only_abstract=True, return_dict=False)
        else:
            docs = loader.get_docs(return_dict=False)

        stemmer = PorterStemmer()

        for id, doc in enumerate(docs):
            text_tokens = dataset_utils.get_tokens(doc.title.strip() + ' ' + doc.text.strip())
            # if len(text_tokens) > 1500:
            #     text_tokens = text_tokens[:1500]
            print('[%d] length= %d' % (id, len(doc.text)))

            stemmed_input = [stemmer.stem(t).strip().lower() for t in text_tokens]

            phrase_str = ';'.join([l.strip() for l in doc.phrases])
            phrases = dataset_utils.process_keyphrase(phrase_str)
            targets = [[stemmer.stem(w).strip().lower() for w in target] for target in phrases]

            present_targets = []

            for target in targets:
                # whether to filter the groundtruth phrases; if config['target_filter'] == None, do nothing
                keep = True
                match = None
                for i in range(len(stemmed_input) - len(target) + 1):
                    match = None
                    for j in range(len(target)):
                        if target[j] != stemmed_input[i + j]:
                            match = False
                            break
                    if j == len(target) - 1 and match == None:
                        match = True
                        break

                if match == True:
                    # if the phrase appears in the text and the filter is 'appear-only', keep it
                    if config['target_filter'] == 'appear-only':
                        keep = keep and True
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and False
                elif match == False:
                    # if the phrase does not appear and the filter is 'appear-only', discard it
                    if config['target_filter'] == 'appear-only':
                        keep = keep and False
                    # if it does not appear and the filter is 'non-appear-only', keep it
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and True

                if not keep:
                    continue

                present_targets.append(target)

            number_groundtruth += len(targets)
            number_present_groundtruth += len(present_targets)

        print('number_groundtruth=' + str(number_groundtruth))
        print('number_present_groundtruth=' + str(number_present_groundtruth))
'''
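# The nested loop above is a stemmed exact-substring test. For clarity, the same predicate
# written as a standalone helper; illustrative only, not part of the original file.
def phrase_appears(target, stemmed_input):
    # True iff the stemmed phrase occurs as a contiguous token sequence
    # in the stemmed source text.
    for i in range(len(stemmed_input) - len(target) + 1):
        if all(target[j] == stemmed_input[i + j] for j in range(len(target))):
            return True
    return False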
word2idx['<eol>'] = 0
word2idx['<unk>'] = 1
idx2word = {word2idx[w]: w for w in word2idx}
voc = ['<eol>', '<unk>'] + voc

# word2idx['X'] = len(voc)
# idx2word[len(voc)] = 'X'
# voc += ['X']
#
# word2idx['Y'] = len(voc)
# idx2word[len(voc)] = 'Y'
# voc += ['Y']
# print word2idx['X'], word2idx['Y']

# load the dataset
Rules, _ = deserialize_from_file('/home/thoma/Work/Dial-DRL/dataset/rules.rnd.n10k.pkl')
num = 200
repeats = 100
maxleg = 15
Lmax = len(idx2word)
rules = dict(source=Rules['source'][:num], target=Rules['target'][:num])


def ftr(v):
    # zero-pad v to a fixed width of three digits
    if v < 10:
        return '00' + str(v)
    elif v < 100:
        return '0' + str(v)
    else:
        return str(v)
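# Aside: ftr is fixed-width zero padding; for 0 <= v <= 999 it matches Python's built-in
# zfill. The original definition above is kept unchanged.
assert all(ftr(v) == str(v).zfill(3) for v in (0, 7, 42, 999))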
    for i in range(len(source_text) - 1):
        for j in range(i + 1, len(source_text)):
            if j - i > max_len:
                continue
            if j - i == 1 and (source_text[i:j] == '<digit>' or len(source_text[i:j][0]) == 1):
                continue
            tagseq = ''.join(source_postag[i:j])
            if re.match(np_regex, tagseq):
                np_list.append((source_text[i:j], source_postag[i:j]))

    print('Text: \t\t %s' % str(source_text))
    print('Noun Phrases:[%d] \n\t\t\t%s' % (len(np_list),
          str('\n\t\t\t'.join([str(p[0]) + '[' + str(p[1]) + ']' for p in np_list]))))

    return np_list


if __name__ == '__main__':
    config = setup_keyphrase_all()
    test_set = db.deserialize_from_file(config['path'] + '/dataset/keyphrase/' + config['data_process_name'] + 'semeval.testing.pkl')

    for s_index, s_str, s_tag in zip(test_set['source'],
                                     test_set['source_str'],
                                     [[s[1] for s in d] for d in test_set['tagged_source']]):
        get_none_phrases(s_str, s_tag, config['max_len'])
                                          config['task_name'], config['timemark']))

n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))

logger.info('*' * 20 + ' config information ' + '*' * 20)
# print config information
for k, v in config.items():
    logger.info("\t\t\t\t%s : %s" % (k, v))
logger.info('*' * 50)

# the data is too large to dump into one file, so it has to be loaded from the raw dataset directly
# train_set, test_set, idx2word, word2idx = keyphrase_dataset.load_data_and_dict(config['training_dataset'], config['testing_dataset'])
train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(config['dataset'])
test_sets = keyphrase_test_dataset.load_additional_testing_data(['inspec'], idx2word, word2idx, config, postagging=False)

logger.info('#(training paper)=%d' % len(train_set['source']))
logger.info('#(training keyphrase)=%d' % sum([len(t) for t in train_set['target']]))
logger.info('#(testing paper)=%d' % sum([len(test_set['target']) for test_set in test_sets.values()]))
logger.info('Load data done.')

if config['voc_size'] == -1:  # do not use <unk>
    config['enc_voc_size'] = max(list(zip(*word2idx.items()))[1]) + 1
    config['dec_voc_size'] = config['enc_voc_size']
            r_array.append(0)

        # print(entry[0], entry[0], entry[1])
        ndcg = ndcg_at_k(r_array, k)
        # print(r_array)
        # print(ndcg)
        ndcg_total += ndcg

    return float(ndcg_total) / len(testing_data)


if __name__ == '__main__':
    config = setup_keyphrase_all()  # load settings.

    training_data_dict, testing_data_list = load_evaluation_data(config['path'] + '/dataset/textbook_linking/ir/')
    docs = deserialize_from_file(config['path'] + '/dataset/textbook_linking/docs.pkl')

    iir_docs = [d for d in docs if d['name'].startswith('iir')]
    mir_docs = [d for d in docs if d['name'].startswith('mir') and d['name'] != 'mir_10_5_2']

    mir_name_map = load_mir_names()
    for d in mir_docs:
        # print('%s -> %s' % (d['name'], mir_name_map[d['name']]))
        d['name'] = mir_name_map[d['name']]

    similarity_matrix = {}
    encoding_name = 'backward'
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup()  # load settings.
for w in config:
    print '{0}={1}'.format(w, config[w])

logger = init_logging(config['path_log'] + '/emolga.RHM.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')

# load the dataset and build a fuel dataset.
idx2word, word2idx = deserialize_from_file(config['vocabulary_set'])
config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
logger.info('build dataset done. vocabulary size = {0}/ batch size = {1}'.format(
    config['dec_voc_size'], config['batch_size']))

# training & validation & testing sets.
train_set, train_size = build_fuel(deserialize_from_file(config['dataset']))
valid_set, valid_size = build_fuel(deserialize_from_file(config['dataset_test']))  # use the test set for a try

# weight save path.
savefile = config['path_h5'] + '/emolga.RHM.id={}.h5'.format(tmark)

# build the agent
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
from keyphrase.config import *

config = setup_keyphrase_all()  # or setup_keyphrase_all_testing()

__author__ = "Rui Meng"
__email__ = "*****@*****.**"

if __name__ == '__main__':
    config = setup_keyphrase_all()  # load settings.

    loader = testing_data_loader('irbooks', kwargs=dict(basedir=config['path']))
    docs = loader.get_docs(return_dict=True)

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(config['dataset'])
    test_sets = load_additional_testing_data(config['testing_datasets'], idx2word, word2idx, config,
                                             postagging=False, process_type=2)

    test_set, test_s_list, test_t_list, test_s_o_list, test_t_o_list, input_encodings, predictions, scores, output_encodings, idx2word \
        = deserialize_from_file(config['predict_path'] + 'predict.{0}.{1}.pkl'.format(config['predict_type'], 'irbooks'))

    do_stem = False

    # Evaluation
    outs, overall_score = keyphrase_utils.evaluate_multiple(
        config,
    return logging


# prepare logging.
config = setup()  # load settings.
for w in config:
    print '{0}={1}'.format(w, config[w])

tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
logger = init_logging(config['path_log'] + '/experiments.LCSTS.Eval.id={}.log'.format(tmark))

n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')

segment = config['segment']
word_set, char_set, word_voc, char_voc = deserialize_from_file('./dataset/lcsts_evaluate_data.pkl')
if segment:
    eval_set = word_set
    word2idx, idx2word = word_voc
else:
    eval_set = char_set
    word2idx, idx2word = char_voc

if config['voc_size'] == -1:  # do not use <unk>
    config['enc_voc_size'] = len(word2idx)
    config['dec_voc_size'] = config['enc_voc_size']
else:
    config['enc_voc_size'] = config['voc_size']
    config['dec_voc_size'] = config['enc_voc_size']
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup()  # load settings.
for w in config:
    print '{0}={1}'.format(w, config[w])

logger = init_logging(config['path_log'] + '/experiments.Copy.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')

idx2word, word2idx, idx2word_o, word2idx_o = deserialize_from_file(config['voc'])
idx2word_o[0] = '<eol>'
word2idx_o['<eol>'] = 0

source, target, origin = deserialize_from_file(config['dataset'])
samples = len(source)

config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
logger.info('build dataset done. ' +
            'dataset size: {} ||'.format(samples) +
            'vocabulary size = {0}/ batch size = {1}'.format(config['dec_voc_size'], config['batch_size']))


def build_data(source, target):
    # create fuel dataset.
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')

# the vocabulary: all three-digit strings '000' ... '999'
tmp = [chr(x) for x in range(48, 58)]  # '0', '1', ..., '9'
voc = [tmp[a] + tmp[b] + tmp[c]
       for c in xrange(10) for b in xrange(10) for a in xrange(10)]
word2idx = {voc[k]: k + 1 for k in xrange(len(voc))}
word2idx['<eol>'] = 0
idx2word = {word2idx[w]: w for w in word2idx}
voc = ['<eol>'] + voc

train_set, test_set = deserialize_from_file(config['dataset'])

config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])

logger.info('build dataset done. ' +
            'dataset size: {} ||'.format(samples) +
            'vocabulary size = {0}/ batch size = {1}'.format(config['dec_voc_size'], config['batch_size']))


def build_data(data):
    # create fuel dataset.
    dataset = datasets.IndexableDataset(indexables=OrderedDict([
        ('source', data['source']),
        ('target', data['target']),
        (