            break

        # if id < 200000:
        #     output_dir = config['baseline_data_path'] + '/maui/ke20k/train(200k)/'
        #     if not os.path.exists(output_dir):
        #         os.makedirs(output_dir)
        #     with open(output_dir + str(id) + '.txt', 'w') as rf:
        #         rf.write(record['title'] + ' \n ' + record['abstract'])
        #     with open(output_dir + str(id) + '.key', 'w') as rf:
        #         for k in record['keyword'].split(';'):
        #             rf.write('%s\t1\n' % k)


if __name__ == '__main__':
    config = config.setup_keyphrase_all()
    export_data_for_maui()

    '''
    examine the data
    '''
    # start_time = time.clock()
    # train_set, test_set, idx2word, word2idx = load_data_and_dict(config['training_dataset'], config['testing_dataset'])
    # serialize_to_file([train_set, test_set, idx2word, word2idx], config['dataset'])
    # print('Finish processing and dumping: %d seconds' % (time.clock() - start_time))

    # export vocabulary to file for manual check
    # wordfreq = sorted(wordfreq.items(), key=lambda a: a[1], reverse=True)
    # serialize_to_file(wordfreq, config['voc'])
    # with open(config['path'] + '/dataset/keyphrase/voc_list.json', 'w') as voc_file:
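# A minimal sketch of the Maui file-pair format produced by the commented-out
# export above: <id>.txt holds the title and abstract separated by ' \n ', and
# <id>.key holds one keyphrase per line followed by a tab and the weight 1.
# The helper name and signature are illustrative, not part of the repo.
import os

def write_maui_pair(output_dir, doc_id, title, abstract, keywords):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(os.path.join(output_dir, '%s.txt' % doc_id), 'w') as f:
        f.write(title + ' \n ' + abstract)
    with open(os.path.join(output_dir, '%s.key' % doc_id), 'w') as f:
        for k in keywords:
            f.write('%s\t1\n' % k)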
    :return:
    '''
    count_1 = 0
    count_5 = 0
    count_10 = 0
    total = 0
    max_val = 0.  # renamed from `max` to avoid shadowing the builtin
    with open(filename, 'r') as f:
        for line in f:
            # each line is a JSON-encoded array of weight values
            values = numpy.array(json.loads(line)).ravel()
            for e in values:
                total += 1
                if abs(e) > 1:
                    count_1 += 1
                if abs(e) > 5:
                    count_5 += 1
                if abs(e) > 10:
                    count_10 += 1
                    print(e)
                if abs(e) > max_val:
                    max_val = abs(e)
                    print('new max = %f' % e)
    print('total = %d' % total)
    # these counters track values whose magnitude *exceeds* each threshold
    print('count > 1/5/10 = %d / %d / %d' % (count_1, count_5, count_10))
    print('max = %f' % max_val)


if __name__ == '__main__':
    config = config.setup_keyphrase_all()  # setup_keyphrase_inspec
    agent = Model()
    agent.load_weight_json(config['weight_json'])
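# A minimal vectorized sketch of the same magnitude audit, assuming (as above)
# that each line of the file is a JSON-encoded array of floats. The function
# name is illustrative, not part of the repo.
import json
import numpy

def magnitude_stats(filename, thresholds=(1, 5, 10)):
    chunks = []
    with open(filename, 'r') as f:
        for line in f:
            chunks.append(numpy.abs(numpy.array(json.loads(line)).ravel()))
    values = numpy.concatenate(chunks)
    counts = {t: int((values > t).sum()) for t in thresholds}
    return len(values), counts, float(values.max())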
def check_data():
    config = setup_keyphrase_all()
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])

    for dataset_name in config['testing_datasets']:
        print('*' * 50)
        print(dataset_name)

        number_groundtruth = 0
        number_present_groundtruth = 0

        loader = testing_data_loader(dataset_name, kwargs=dict(basedir=config['path']))
        if dataset_name == 'nus':
            docs = loader.get_docs(only_abstract=True, return_dict=False)
        else:
            docs = loader.get_docs(return_dict=False)

        stemmer = PorterStemmer()
        for id, doc in enumerate(docs):
            text_tokens = dataset_utils.get_tokens(doc.title.strip() + ' ' + doc.text.strip())
            # if len(text_tokens) > 1500:
            #     text_tokens = text_tokens[:1500]
            print('[%d] length = %d' % (id, len(doc.text)))

            stemmed_input = [stemmer.stem(t).strip().lower() for t in text_tokens]

            phrase_str = ';'.join([l.strip() for l in doc.phrases])
            phrases = dataset_utils.process_keyphrase(phrase_str)
            targets = [[stemmer.stem(w).strip().lower() for w in target] for target in phrases]

            present_targets = []
            for target in targets:
                # whether to filter this groundtruth phrase; if config['target_filter'] is None, keep everything
                keep = True
                # scan for an exact occurrence of the stemmed target phrase in the stemmed source
                match = None
                for i in range(len(stemmed_input) - len(target) + 1):
                    match = None
                    for j in range(len(target)):
                        if target[j] != stemmed_input[i + j]:
                            match = False
                            break
                    if j == len(target) - 1 and match is None:
                        match = True
                        break

                if match is True:
                    # phrase appears in the text: keep it for 'appear-only', drop it for 'non-appear-only'
                    if config['target_filter'] == 'appear-only':
                        keep = keep and True
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and False
                elif match is False:
                    # phrase does not appear: drop it for 'appear-only', keep it for 'non-appear-only'
                    if config['target_filter'] == 'appear-only':
                        keep = keep and False
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and True

                if not keep:
                    continue

                present_targets.append(target)

            number_groundtruth += len(targets)
            number_present_groundtruth += len(present_targets)

        print('number_groundtruth = ' + str(number_groundtruth))
        print('number_present_groundtruth = ' + str(number_present_groundtruth))

'''
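# A compact sketch of the presence test implemented by the nested loops in
# check_data above: a stemmed target phrase is "present" iff it occurs as a
# contiguous token subsequence of the stemmed source. The helper name is
# illustrative, not part of the repo.
def occurs_in(target, stemmed_input):
    m = len(target)
    return any(stemmed_input[i:i + m] == target
               for i in range(len(stemmed_input) - m + 1))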
    for i in range(len(source_text) - 1):
        for j in range(i + 1, len(source_text)):
            if j - i > max_len:
                continue
            # skip 1-token candidates that are the <digit> placeholder or a single character
            if j - i == 1 and (source_text[i] == '<digit>' or len(source_text[i]) == 1):
                continue
            tagseq = ''.join(source_postag[i:j])
            if re.match(np_regex, tagseq):
                np_list.append((source_text[i:j], source_postag[i:j]))

    print('Text: \t\t %s' % str(source_text))
    print('Noun Phrases: [%d] \n\t\t\t%s'
          % (len(np_list),
             str('\n\t\t\t'.join([str(p[0]) + '[' + str(p[1]) + ']' for p in np_list]))))

    return np_list


if __name__ == '__main__':
    config = setup_keyphrase_all()

    test_set = db.deserialize_from_file(config['path'] + '/dataset/keyphrase/'
                                        + config['data_process_name'] + 'semeval.testing.pkl')

    for s_index, s_str, s_tag in zip(test_set['source'],
                                     test_set['source_str'],
                                     [[s[1] for s in d] for d in test_set['tagged_source']]):
        get_none_phrases(s_str, s_tag, config['max_len'])
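# A self-contained sketch of the tag-concatenation idea used in get_none_phrases:
# the POS tags of a candidate span are joined into one string and matched against
# a noun-phrase regex. The pattern below (optional adjectives followed by nouns)
# is an illustrative assumption, not the repo's actual np_regex.
import re

NP_REGEX = re.compile(r'^(JJ|JJR|JJS)*(NN|NNS|NNP|NNPS)+$')

def is_noun_phrase(tags):
    return bool(NP_REGEX.match(''.join(tags)))

# e.g. is_noun_phrase(['JJ', 'NN']) -> True, is_noun_phrase(['VB', 'NN']) -> False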