import sys

# `wordmap`, `HYPERPARAMETERS` and `build` are assumed to be defined elsewhere
# in this module; `_indexed_weights` caches the computed weights across calls.
_indexed_weights = None

def indexed_weights():
    global _indexed_weights
    if _indexed_weights is not None:
        return _indexed_weights
    print >> sys.stderr, len(wordmap.map), "=?=", HYPERPARAMETERS["VOCABULARY_SIZE"]
    assert len(wordmap.map) == HYPERPARAMETERS["VOCABULARY_SIZE"]
    if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0:
        # Uniform noise distribution: every word id gets weight 1.
        _indexed_weights = [1 for id in range(len(wordmap.map))]
    elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1:
        # Unigram noise distribution: weight each word by its smoothed count.
        from common.json import load
        from common.file import myopen
        from collections import defaultdict
        ngrams_file = HYPERPARAMETERS["NGRAMS"][(HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"],
                                                 HYPERPARAMETERS["VOCABULARY_SIZE"])]
        print >> sys.stderr, "Reading ngrams from", ngrams_file, "..."
        ngramcnt = defaultdict(int)
        for (ngram, cnt) in load(myopen(ngrams_file)):
            assert len(ngram) == 1
            ngramcnt[ngram[0]] = cnt + HYPERPARAMETERS["TRAINING_NOISE_SMOOTHING_ADDITION"]
        _indexed_weights = [ngramcnt[wordmap.str(id)] for id in range(len(wordmap.map))]
        _indexed_weights = build(_indexed_weights)
    else:
        # Only 0 (uniform) and 1 (unigram) noise distributions are supported.
        assert 0
    return _indexed_weights
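The hyperparameter names suggest these weights act as a noise distribution over word ids during training. As a minimal sketch of how such per-id weights could drive sampling, assuming a plain list of nonnegative weights (before `build()` transforms it in the unigram branch), `sample_noise_word` below is a hypothetical helper, not part of this module:

import random

def sample_noise_word(weights):
    # Hypothetical helper: draw a word id with probability proportional
    # to its weight, via a linear scan over the running cumulative sum.
    target = random.uniform(0, sum(weights))
    total = 0
    for id, w in enumerate(weights):
        total += w
        if total >= target:
            return id
    return len(weights) - 1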
bister = '(un|duo|tre|bis|qua|quin[tqu]*|sex|sept|octo?|novo?|non|dec|vic|ter|ies)+'
re_alin_sup = re.compile(ur'supprimés?\)$', re.I)
re_clean_alin = re.compile(r'^"?([IVXCDLM]+|\d+|[a-z]|[°)\-\.\s]+)+\s*((%s|[A-Z]+)[°)\-\.\s]+)*' % bister)
re_upper_first = re.compile(r'^(.)(.*)$')

step_id = ''
old_step_id = ''
for nstep, step in enumerate(steps):
    if 'resulting_text_directory' not in step:
        if step['stage'] not in [u"promulgation", u"constitutionnalité"]:
            sys.stderr.write("WARNING no directory found for step %s\n" % step['stage'])
        continue
    try:
        path = os.path.join(sourcedir, step['resulting_text_directory'])
        step_id = "%02d%s" % (nstep, step['directory'][2:])
        with open(os.path.join(path, 'texte.json'), "r") as texte:
            data = json.load(texte)
        echec = (step['echec'] and step['echec'] != "renvoi en commission")
        if echec:
            # Record a failed step ("échec") as a pseudo-article and pseudo-section.
            if 'echec' not in out['articles']:
                out['articles']['echec'] = {'id': 'echec', 'titre': step['echec'],
                                            'section': 'echec', 'steps': []}
            next_step = create_step(step_id, step['directory'], echec_type=step['echec'])
            out['articles']['echec']['steps'].append(next_step)
            if 'echec' not in out['sections']:
                out['sections']['echec'] = {}
            out['sections']['echec'][step_id] = {'title': data['expose'],
                                                 'type': step['echec'].upper()}
            continue
        for section in data['sections']:
            if section['id'] not in out['sections']:
                out['sections'][section['id']] = {}
            out['sections'][section['id']][step_id] = {
                'title': section['titre'],
                'type': re_upper_first.sub(lambda x: x.group(1).upper() + x.group(2),
                                           section['type_section'])}
r'^"?([IVXCDLM]+|\d+|[a-z]|[°)\-\.\s]+)+\s*((%s|[A-Z]+)[°)\-\.\s]+)*' % bister) re_upper_first = re.compile(r'^(.)(.*)$') step_id = '' old_step_id = '' for nstep, step in enumerate(steps): if not 'resulting_text_directory' in step: if step['stage'] not in [u"promulgation", u"constitutionnalité"]: sys.stderr.write("WARNING no directory found for step %s\n" % step['stage']) continue try: path = os.path.join(sourcedir, step['resulting_text_directory']) step_id = "%02d%s" % (nstep, step['directory'][2:]) with open(os.path.join(path, 'texte.json'), "r") as texte: data = json.load(texte) echec = (step['echec'] and step['echec'] != "renvoi en commission") if echec: if not 'echec' in out['articles']: out['articles']['echec'] = { 'id': 'echec', 'titre': step['echec'], 'section': 'echec', 'steps': [] } next_step = create_step(step_id, step['directory'], echec_type=step['echec']) out['articles']['echec']['steps'].append(next_step) if not 'echec' in out['sections']:
        # Compute the similarity matrix for this author's papers.
        sims = index[tfidf[bow_corpus]]
        # Cluster the similarity matrix, then map row indices back to paper ids.
        i_cluster = graph_sim_matrix(sims, corr)
        author_cluster = [[author_papers[idx]['id'] for idx in l_inside]
                          for l_inside in i_cluster]
        # res_realx = {}
        # res_realx[author] = res_real[author]
        # print(author, 'pairwise-f1', pairwise_f1(res_realx, {author: author_cluster}))
        print(i, author, 'papers:', len(author_papers),
              'authors after disambiguation:', len(author_cluster))
        res_dict[author] = author_cluster
    return res_dict


if __name__ == "__main__":
    # Load the labeled author and publication data (the train files serve as the test set here).
    train_author_data = json.load(open(train_author_path, 'r', encoding='utf-8'))
    train_pub_data = json.load(open(train_pub_path, 'r', encoding='utf-8'))
    # Select a subset of the data for testing.
    author_list = list(train_author_data.keys())
    author_selects = author_list[0:200]
    # Manually labeled ground-truth clusters.
    res_real = {}
    for author in author_selects:
        p_merge = []
        for plist in train_author_data[author].values():
            p_merge.append(plist)
        res_real[author] = p_merge
    papers = {}
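The commented-out check above references a `pairwise_f1` metric. Here is a minimal sketch of one, under the assumption that both arguments map an author name to a list of paper-id clusters; the function name matches the call site, but the body is an illustration, not this project's actual implementation:

from itertools import combinations

def pairwise_f1(real, pred):
    """Pairwise F1: compare which paper pairs land in the same cluster."""
    def same_cluster_pairs(clusters):
        pairs = set()
        for cluster in clusters:
            pairs.update(combinations(sorted(cluster), 2))
        return pairs
    tp = fp = fn = 0
    for author in real:
        real_pairs = same_cluster_pairs(real[author])
        pred_pairs = same_cluster_pairs(pred[author])
        tp += len(real_pairs & pred_pairs)   # pairs correctly placed together
        fp += len(pred_pairs - real_pairs)   # pairs wrongly placed together
        fn += len(real_pairs - pred_pairs)   # pairs wrongly split apart
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0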