# Inferred imports for the functions in this module (an assumption based on
# usage below; project-local modules such as utils, util, pann, ah, aa,
# HighLighter and helpers like score_language_patterns, get_stats_obj,
# sort_by_threshold, update_paper_fulltext are expected to come from the
# surrounding repository).
import os
import re
import math
import json
from os import listdir
from os.path import split, join, isfile, exists

import nltk
import xml.etree.ElementTree as ET
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools


def geometric_analysis(ann_file, container, out_file, highlighter):
    p, fn = os.path.split(ann_file)
    score_file = os.path.join('./summaries/', fn[0:fn.rfind('.')] + '_scores.json')
    scores = utils.load_json_data(score_file)
    sent_scores = {}
    for s in scores:
        sent_scores[s['sid']] = s
    anns = utils.load_json_data(ann_file)
    ht_obj = {
        'total': len(anns),
        'ht_sids': [],
        'sect_dict': {},
        'sects': {},
        'page_dict': {},
        'total_page': 0,
        'id': ann_file,
        'sid_cat': {}
    }
    sect = ''
    last_sid = ''
    for ann in anns:
        if 'marked' in ann and len(ann['marked']) > 0:
            ht_obj['ht_sids'].append(ann['sid'])
            if 'struct' in ann:
                ht_obj['sect_dict'][ann['struct']] = [ann['sid']] if ann['struct'] not in ht_obj['sect_dict'] else \
                    ht_obj['sect_dict'][ann['struct']] + [ann['sid']]
            if 'page' in ann:
                ht_obj['page_dict'][ann['page']] = [ann['sid']] if ann['page'] not in ht_obj['page_dict'] else \
                    ht_obj['page_dict'][ann['page']] + [ann['sid']]
            ht_obj['sid_cat'][ann['sid']] = highlighter.get_sentence_cat_bd(sent_scores[ann['sid']])
        if 'page' in ann:
            ht_obj['total_page'] = ann['page']
        if ann['struct'] != sect:
            if sect.strip() != '':
                ht_obj['sects'][sect]['end'] = last_sid
            sect = ann['struct']
            # 'star' (likely meant 'start') is kept as-is: downstream readers may expect this key
            ht_obj['sects'][ann['struct']] = {'star': ann['sid']}
        last_sid = ann['sid']
        if int(ann['sid']) > ht_obj['total']:
            ht_obj['total'] = int(ann['sid'])
    ht_obj['sects'][sect]['end'] = last_sid
    sum_file = os.path.join('./summaries/', fn[0:fn.rfind('.')] + '.sum')
    summary = utils.load_json_data(sum_file)  # renamed from `sum` to avoid shadowing the builtin
    if 'journal' in summary:
        ht_obj['journal'] = summary['journal']
    else:
        ht_obj['journal'] = 'J.'
    container.append(ht_obj)
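# Usage sketch for the container-accumulator convention used throughout this
# module (hypothetical; `ann_files` and the glob it stands for are assumptions,
# the output path matches what get_general_highlights reads back below):
# container = []
# hter = HighLighter.get_instance()
# for f in ann_files:  # e.g. the '*_annotated_ann.json' files of a corpus
#     geometric_analysis(f, container, None, hter)
# utils.save_json_array(container, './training/geo_features.json')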
def get_ncbo_stats(ann_file, container):
    anns = utils.load_json_data(ann_file)
    onto2freq = {'ht': {}, 'nm': {}}
    total_nm = 0
    total_ht = 0
    for ann in anns:
        if 'marked' in ann:
            total_ht += 1
        else:
            total_nm += 1
        if 'ncbo' in ann:
            matched_ontos = []
            for ncbo in ann['ncbo']:
                for name in pann.onto_name:
                    if name not in matched_ontos and ncbo['uri'].startswith(pann.onto_name[name]):
                        matched_ontos.append(name)
                    if name in matched_ontos:
                        break
            # for name in matched_ontos:
            #     ctn = onto2freq['ht'] if 'marked' in ann else onto2freq['nm']
            #     ctn[name] = 1 if name not in ctn else 1 + ctn[name]
            if len(matched_ontos) > 0:
                comb = '-'.join(sorted(matched_ontos))
                ctn = onto2freq['ht'] if 'marked' in ann else onto2freq['nm']
                ctn[comb] = 1 if comb not in ctn else 1 + ctn[comb]
    container.append({
        'total_nm': total_nm,
        'total_ht': total_ht,
        'freqs': onto2freq
    })
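# Shape of one appended stats object (inferred from the code above; the
# ontology names and counts are illustrative only):
# {
#     'total_nm': 120,
#     'total_ht': 15,
#     'freqs': {
#         'ht': {'GO': 4, 'CHEBI-GO': 2},  # combination keys are sorted and '-'-joined
#         'nm': {'GO': 11}
#     }
# }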
def visualise_lp_ranged_stats(stat_file, cat, title, skips=None, score_output_file=None):
    r2stats = utils.load_json_data(stat_file)
    data = []
    data2save = {}
    for r in r2stats:
        stats = r2stats[r]
        total_normal = stats['s_nm']
        total_highlights = stats['s_ht']
        keys, scores = score_language_patterns(stats['nm'][cat], stats['ht'][cat],
                                               total_normal, total_highlights)
        if score_output_file is None:
            trace1 = go.Bar(x=keys, y=scores, name=r)
            data.append(trace1)
        else:
            data2save[r] = {}
            for i in range(len(keys)):
                data2save[r][keys[i]] = scores[i]
    if score_output_file is None:
        layout = go.Layout(barmode='group', title=title)
        fig = go.Figure(data=data, layout=layout)
        py.plot(fig, filename='language pattern ranged stats - ' + cat)
    else:
        utils.save_json_array(data2save, score_output_file)
def visualise_categorised_geometric(geo_feature_file, fn):
    gms = utils.load_json_data(geo_feature_file)
    journal2cat = {}
    journal2papers = {}
    for paper in gms:
        # j = paper['journal']
        j = 'all'
        journal2cat[j] = {} if j not in journal2cat else journal2cat[j]
        cat_trace = journal2cat[j]
        journal2papers[j] = [j, 1] if j not in journal2papers else [j, 1 + journal2papers[j][1]]
        sects = paper['sect_dict']
        sid_cat = paper['sid_cat']
        for y in sects:
            for x in sects[y]:
                cat = sid_cat[x]
                if cat in ['cardinal nouns', 'named entities', 'general']:
                    continue
                if cat not in cat_trace:
                    cat_trace[cat] = {'x': [], 'y': []}
                trace = cat_trace[cat]
                trace['x'].append(1.0 * int(x) / int(paper['total']))
                label_y = y.replace('deo:', '').replace('DoCO:', '') \
                    .replace('BodyMatter', 'Others').replace('FrontMatter', 'Others')
                trace['y'].append(label_y)
    sorted_journals = sorted([journal2papers[j] for j in journal2papers],
                             cmp=lambda jp1, jp2: jp2[1] - jp1[1])
    print sorted_journals
    print len(sorted_journals)
    # selected_j = sorted_journals[1][0]  # skip the no-journal paper group
    selected_j = 'all'
    cat_trace = journal2cat[selected_j]
    traces = []
    for cat in cat_trace:
        traces.append(go.Scatter(x=cat_trace[cat]['x'], y=cat_trace[cat]['y'],
                                 mode='markers', name=cat))
    # print traces
    layout = go.Layout(
        title='highlights over spatial dimensions',  # selected_j + ' - language pattern breakdown'
        yaxis=dict(categoryorder='array',
                   categoryarray=['Introduction', 'Methods', 'Results', 'Discussion', 'Others']))
    fig = go.Figure(data=traces, layout=layout)
    # py.plot(fig, filename=fn)  # + ' - ' + selected_j)
    py.image.save_as({'data': traces, 'layout': layout}, './results/spatial.pdf')
def paper_stat(ann_file, container):
    path, fn = utils.split(ann_file)
    sums = utils.load_json_data(
        utils.join('./20-test-papers/summaries/', fn[:fn.rfind('.')] + '.sum'))
    anns = utils.load_json_data(ann_file)
    total_ht = 0
    for ann in anns:
        if 'marked' in ann:
            total_ht += 1
    container.append({
        'f': ann_file,
        'ht': total_ht,
        'nm': len(anns) - total_ht,
        'total': len(anns),
        'PMID': sums['PMID'] if 'PMID' in sums else '',
        'Journal': sums['journal'] if 'journal' in sums else ''
    })
def get_sp_ne_associations(score_file, container):
    scores = utils.load_json_data(score_file)
    sp2ne = {}
    for s in scores:
        p = s['pattern']
        if 'sp_index' in p and p['sp_index'] > -1 and s['ne'] > 0:
            sp2ne[p['sp_index']] = 1 if p['sp_index'] not in sp2ne else 1 + sp2ne[p['sp_index']]
    container.append(sp2ne)
def test_merge_highlight():
    ht_file = './30-test-papers/11274654_ht.json'
    ann_file = './30-test-papers/11274654_annotated_ann.json'
    ann = util.load_json_data(ann_file)
    for a in ann:
        if 'marked' in a:
            a['marked'] = []
    ht = read_highlights_json(ht_file)
    merge_highlights(ann, ht)
    util.save_json_array(ann, ann_file)
def summ(highlighter, ann_file, out_path):
    anns = utils.load_json_data(ann_file)
    p, fn = split(ann_file)
    score_file = join(out_path, fn[:fn.rfind('.')] + '_scores.json')
    sid_to_score = {}
    if isfile(score_file):
        stored_scores = utils.load_json_data(score_file)
        i = 1
        for score in stored_scores:
            sid_to_score[score['sid']] = score
            i += 1
    summary, scores = highlighter.summarise([s['text'] for s in anns],
                                            src=ann_file,
                                            sids=[s['sid'] for s in anns],
                                            score_dict=sid_to_score)
    # if not isfile(score_file):
    utils.save_json_array(scores, score_file)
    utils.save_json_array(summary, join(out_path, fn[:fn.rfind('.')] + '.sum'))
def get_general_highlights():
    geos = utils.load_json_data('./training/geo_features.json')
    sents = []
    for g in geos:
        f_ann = g['id']
        sids = []
        for sid in g['sid_cat']:
            if g['sid_cat'][sid] == 'general':
                sids.append(sid)
        if len(sids) > 0:
            anns = utils.load_json_data(f_ann)
            for ann in anns:
                if ann['sid'] in sids:
                    sents.append({
                        'text': ann['text'],
                        'marked': ann['marked'] if 'marked' in ann else ''
                    })
    utils.save_json_array(sents, './training/general_highlights.json')
def update_score_path_summ(score_path):
    # regenerate sum because of new score file after semantic fixing
    hter = ah.HighLighter.get_instance()
    sum_files = utils.filter_path_file(score_path, 'sum')
    for s in sum_files:
        pmcid = s[:s.rfind('_')]
        print join(score_path[:score_path.rfind('/summ')], pmcid + '_ann.json')
        ah.summ(hter,
                join(score_path[:score_path.rfind('/summ')], pmcid + '_ann.json'),
                score_path)
        update_paper_summ(pmcid, utils.load_json_data(join(score_path, s)))
        print 'paper %s summary uploaded' % pmcid
def summ_mt(ann_file, out_path):
    # test run nltk
    aa.extract_cd_nouns_nes('good 12 cn', {}, {})
    anns = utils.load_json_data(ann_file)
    p, fn = split(ann_file)
    score_file = join(out_path, fn[:fn.rfind('.')] + '_scores.json')
    sid_to_score = {}
    if isfile(score_file):
        stored_scores = utils.load_json_data(score_file)
        i = 1
        for score in stored_scores:
            sid_to_score[score['sid']] = score
            i += 1
    sum_file = join(out_path, fn[:fn.rfind('.')] + '.sum')
    HighLighter.multithread_summ([s['text'] for s in anns], 6, score_file, sum_file,
                                 src=ann_file,
                                 sids=[s['sid'] for s in anns],
                                 score_dict=sid_to_score)
def load_ht_data(ann_file_path):
    score_files = [
        join(ann_file_path, f) for f in listdir(ann_file_path)
        if isfile(join(ann_file_path, f)) and f.endswith('_annotated_ann.json')
    ]
    sents = []
    for sf in score_files:
        sents += [{
            'text': so['text'],
            'class': 'ht' if 'marked' in so else 'nht'
        } for so in utils.load_json_data(sf)]
    return sents
    # unreachable debug line from the original, kept for reference:
    # print 'total #sents %s \n top 1 is %s' % (len(sents), sents[0])
def get3DCords(score_file, container, out_file, hter):
    scores = utils.load_json_data(score_file)
    anns = utils.load_json_data(scores[0]['doc_id'])
    sids = []
    for ann in anns:
        if 'marked' in ann:
            sids.append(ann['sid'])
    for s in scores:
        if s['sid'] not in sids:
            continue
        cat = hter.get_sp_type(s)
        p = s['pattern']
        nes = sorted(list(set([k for k in p['nes']])))
        cds = sorted(list(set([k for k in p['cds']])))
        container.append({
            'x': cat,
            # 'N/A' if 'sp_index' not in p or p['sp_index'] == -1 else \
            #     '-'.join(p['sub'] if p['sub'] is not None else []) + ' ' + \
            #     '-'.join(p['pred'] if p['pred'] is not None else []),
            'y': len(nes),
            'z': len(cds)
            # 'y': 'N/A' if len(p['nes']) == 0 else ' '.join(nes),
            # 'z': 'N/A' if len(p['cds']) == 0 else ' '.join(cds),
        })
def paper_language_pattern_dist(score_file, container, hter, out_file):
    scores = utils.load_json_data(score_file)
    anns = utils.load_json_data(scores[0]['doc_id'])
    b_marked = False
    hts = []
    for i in range(len(anns)):
        ann = anns[i]
        if 'marked' in ann:
            b_marked = True
            hts.append(ann['sid'])
    # keep only papers with 10-15 highlights; the original chained comparison
    # `15 > len(hts) < 10` reduced to `len(hts) < 10`, which looks unintended
    if not b_marked or len(hts) < 10 or len(hts) > 15:
        return
    max_sid = int(scores[len(scores) - 1]['sid'])
    stat = {'ht': {}, 'all': {}, 'max_sid': max_sid}
    for s in scores:
        all_sp_types = []
        cat = hter.get_sp_type(s, all_types=all_sp_types)
        for t in all_sp_types:
            stat['all'][t] = 1 if t not in stat['all'] else 1 + stat['all'][t]
            if s['sid'] in hts:
                stat['ht'][t] = 1 if t not in stat['ht'] else 1 + stat['ht'][t]
        p = s['pattern']
        if len(p['nes']) > 0:
            t = 'NE'
            stat['all'][t] = 1 if t not in stat['all'] else 1 + stat['all'][t]
            if s['sid'] in hts:
                stat['ht'][t] = 1 if t not in stat['ht'] else 1 + stat['ht'][t]
        if len(p['cds']) > 0:
            t = 'CDS'
            stat['all'][t] = 1 if t not in stat['all'] else 1 + stat['all'][t]
            if s['sid'] in hts:
                stat['ht'][t] = 1 if t not in stat['ht'] else 1 + stat['ht'][t]
    container.append(stat)
def append_abstract_label(xml_file):
    p, f = os.path.split(xml_file)
    ann_file = os.path.join(p, f[:f.rfind('.')] + '_ann.json')
    tree = ET.parse(xml_file)
    root = tree.getroot()
    abstracts = root.findall(".//abstract")
    if len(abstracts) > 0:
        ab_sents = abstracts[0].findall("s")
        max_ab_sid = int(ab_sents[len(ab_sents) - 1].attrib['sid'])
        if max_ab_sid >= 0:
            anns = util.load_json_data(ann_file)
            for ann in anns:
                if int(ann['sid']) <= max_ab_sid:
                    ann['abstract-title'] = True
            util.save_json_array(anns, ann_file)
def compute_sp_type_statics():
    sp2ratio = {}
    stats = utils.load_json_data('./training/language_pattern_stats_ranged.json')
    total = 0
    for r in stats:
        total += stats[r]['s_ht'] + stats[r]['s_nm']
        for p in stats[r]['ht']['sp']:
            print p, stats[r]['ht']['sp'][p]
            sp2ratio[p] = stats[r]['ht']['sp'][p] if p not in sp2ratio \
                else stats[r]['ht']['sp'][p] + sp2ratio[p]
    print json.dumps(sp2ratio)
    for p in sp2ratio:
        sp2ratio[p] = sp2ratio[p] * 1.0 / total
    print json.dumps(sp2ratio)
def process_pmc_paper(pmcid, job_path, job_id):
    ann_file = join(job_path, pmcid + '_ann.json')
    if exists(ann_file):
        print '%s exists, skipping download' % ann_file
        update_paper_fulltext(pmcid, utils.load_json_data(ann_file))
        return
    t = get_pmc_paper_fulltext(pmcid)
    if t is None or len(t) == 0:
        utils.append_text_file(pmcid, join(job_path, 'not_available.txt'))
    else:
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(t.strip())
        if job_path is not None:
            fulltext = [{'text': sents[i], 'sid': str(i + 1)} for i in range(len(sents))]
            utils.save_json_array(fulltext, ann_file)
            update_paper_fulltext(pmcid, fulltext)
def compute_sp_type_regioned_weights():
    sp2ratio = {}
    stats = utils.load_json_data('./training/language_pattern_stats_ranged.json')
    total = 0
    for r in stats:
        total += stats[r]['s_ht'] + stats[r]['s_nm']
        for p in stats[r]['ht']['sp']:
            sp2ratio[p] = {} if p not in sp2ratio else sp2ratio[p]
            sp2ratio[p][r] = stats[r]['ht']['sp'][p]
            sp2ratio[p]['max'] = stats[r]['ht']['sp'][p] \
                if 'max' not in sp2ratio[p] or sp2ratio[p]['max'] < stats[r]['ht']['sp'][p] \
                else sp2ratio[p]['max']
    for p in sp2ratio:
        for k in sp2ratio[p]:
            m = sp2ratio[p]['max']
            if k != 'max':
                sp2ratio[p][k] = 1.0 * sp2ratio[p][k] / m
    print json.dumps(sp2ratio)
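# Worked example of the max-normalisation above (illustrative numbers):
#   raw per-region counts for one pattern p: {'r0': 4, 'r1': 8, 'r2': 2} -> max = 8
#   normalised weights:                      {'r0': 0.5, 'r1': 1.0, 'r2': 0.25, 'max': 8}
# Each pattern's regional frequency is scaled by its own peak region; the
# result resembles the sent_type_region_boost.json lookups consumed by
# score_paper_threshold below.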
def visualise_highlights_geometric(geo_feature_file, fn, cat):
    gms = utils.load_json_data(geo_feature_file)
    subplots = {}
    for paper in gms:
        j = paper['journal']
        if j not in subplots:
            subplots[j] = []
        traces = subplots[j]
        y_vals = []
        x_vals = []
        sects = paper['sect_dict']
        sid_cat = paper['sid_cat']
        for y in sects:
            for x in sects[y]:
                if sid_cat[x] == cat:
                    x_vals.append(1.0 * int(x) / int(paper['total']))
                    y_vals.append(y)
        traces.append({'x': x_vals, 'y': y_vals})
    plots = []
    for j in subplots:
        if len(subplots[j]) >= 6 and j is not None:
            m_x = []
            m_y = []
            for d in subplots[j]:
                m_x += d['x']
                m_y += d['y']
            plots.append(go.Scatter(x=m_x, y=m_y, mode='markers',
                                    name=j if j is not None else 'unknown'))
    fig = tools.make_subplots(rows=len(plots), cols=1, shared_xaxes=True)
    for i in range(len(plots)):
        fig.append_trace(plots[i], i + 1, 1)
    fig['layout'].update(height=600, width=600)
    py.plot(fig, filename=fn)
def visualise_lp_stats(stat_file, cat, title, skips=None, score_output_file=None):
    stats = utils.load_json_data(stat_file)
    total_normal = stats['s_nm']
    total_highlights = stats['s_ht']
    keys, scores = score_language_patterns(stats['nm'][cat], stats['ht'][cat],
                                           total_normal, total_highlights)
    if score_output_file is None:
        trace1 = go.Bar(x=keys, y=scores, name='Highlighted Sentences / Other Sentences')
        data = [trace1]
        layout = go.Layout(barmode='group', title=title)
        fig = go.Figure(data=data, layout=layout)
        py.plot(fig, filename='language pattern stats - ' + cat)
    else:
        data = {}
        for i in range(len(keys)):
            data[keys[i]] = scores[i]
        utils.save_json_array(data, score_output_file)
def load_resources(ne_file, cd_file, sp_file, sf_nes, sf_cds, sf_sp,
                   sf_ranged_nes, sf_ranged_cds, sf_ranged_sp, sp_cat_file=None):
    ne = read_text_res(ne_file)
    cd = read_text_res(cd_file)
    sp = read_sub_pred_file(sp_file)
    sp_cats = None if sp_cat_file is None else utils.load_json_data(sp_cat_file)
    scores_nes = utils.load_json_data(sf_nes)
    scores_cds = utils.load_json_data(sf_cds)
    scores_sp = utils.load_json_data(sf_sp)
    scores_ranged_nes = utils.load_json_data(sf_ranged_nes)
    scores_ranged_cds = utils.load_json_data(sf_ranged_cds)
    scores_ranged_sp = utils.load_json_data(sf_ranged_sp)
    return ne, cd, sp, sp_cats, \
        scores_nes, scores_cds, scores_sp, \
        scores_ranged_nes, scores_ranged_cds, scores_ranged_sp
def file_match_concepts(ann_file, concepts):
    anns = utils.load_json_data(ann_file)
    for ann in anns:
        ret = match_concepts(ann['text'], concepts)
        if len(ret) > 0:
            print ret, ann['sid']
def score_paper_threshold(score_file, container, out_file, hter, threshold, manual_ann=None):
    ma = None
    if manual_ann is not None:
        fpath, fn = split(score_file)
        m = re.match(r'(\d+)_annotated_ann_scores\.json', fn)
        if m is not None:
            paperid = m.group(1)
            if paperid in manual_ann:
                ma = manual_ann[paperid]
    units = 5
    scores = utils.load_json_data(score_file)
    max_sid = int(scores[len(scores) - 1]['sid'])
    offset = int(math.ceil(1.0 * len(scores) / units))
    anns = utils.load_json_data(scores[0]['doc_id'])
    hts = []
    sid2ann = {}
    sid2onto = {}  # note: never populated here, so onto_score stays 0 unless filled elsewhere
    abstract_sents = []
    for ann in anns:
        if ma is not None and 'max_abstract_sid' in ma and int(ann['sid']) <= ma['max_abstract_sid']:
            abstract_sents.append(ann['sid'])
            continue  # skip the abstract sentences
        # if 'abstract-title' in ann or ('struct' in ann and (ann['struct'] == 'DoCO:Abstract' or ann['struct'] == 'DoCO:Title')):
        #     abstract_sents.append(ann['sid'])
        #     continue  # skip the abstract sentences
        if 'marked' in ann:
            hts.append(ann['sid'])
            sid2ann[ann['sid']] = ann
    # skip papers with no highlights
    # if len(hts) == 0:
    #     return
    if ma is not None:
        hts += [str(sid) for sid in ma['also_correct']]
    prediction = []
    num_correct = 0
    sentence_level_details = []
    for i in range(len(scores)):
        r = (i + 1) / offset
        score = scores[i]
        if score['sid'] in abstract_sents:
            continue  # skip the abstract sentences
        score_ret = hter.score(score, region='r' + str(r))
        sent_type = '-'.join(sorted(score_ret['all_sps']))
        onto2scores = HighLighter.get_onto_name_scores()
        onto_score = 0 if score['sid'] not in sid2onto else \
            0 if sid2onto[score['sid']] not in onto2scores \
            else onto2scores[sid2onto[score['sid']]]
        confidence = 1 if 'confidence' not in score['pattern'] else score['pattern']['confidence']
        # if confidence < 1:
        #     sent_type = ''
        if (len(score_ret['sp']) > 0) \
                or (score_ret['cds'] + score_ret['nes'] > 0) \
                or onto_score > .2:
            s_sp = 0.0
            if len(score_ret['sp']) > 0:
                if len(score_ret['sp']) == 1:
                    for t in score_ret['sp']:
                        s_sp = score_ret['sp'][t]
                else:
                    type_score = []
                    for t in score_ret['sp']:
                        type_score.append([t, score_ret['sp'][t]])
                    type_score = sorted(type_score,
                                        cmp=lambda p1, p2: 1 if p2[1] > p1[1] else 0 if p2[1] == p1[1] else -1)
                    s_sp = type_score[0][1]
            # average combination
            # s = (s_sp + score_ret['cds'] + score_ret['nes']) / 3
            # empirical setting
            s = 0.35 * s_sp + .2 * score_ret['cds'] + .45 * score_ret['nes']
            # F2: voting enhancement
            voted = 0
            if score_ret['nes'] > 0:
                voted += 1
            if score_ret['cds'] > 0:
                voted += 1
            if s_sp > 0:
                voted += 0.18
            s *= voted / 2.18
            # F3: type regional boosting (spatial features)
            type_boost = .3 if r in [0, 1] else .07 if r in [2, 3] else 0.005
            region = 'r%s' % r
            sent_boost = HighLighter.get_sent_type_boost()
            if sent_type in sent_boost:
                type_boost = sent_boost[sent_type][region] if region in sent_boost[sent_type] else 0.001
            type_boost = math.pow(type_boost, 1.2)
            s *= type_boost * 10
            prediction.append([score['sid'], s, sent_type])
            if score['sid'] in hts or s > threshold:
                sentence_level_details.append(
                    u'[{}]\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                        score['sid'],
                        'H' if score['sid'] in hts else '-',
                        'P' if s > threshold else '-',
                        sent_type,
                        '{}/{}'.format(s, type_boost),
                        '{}/{}'.format(s_sp, confidence),
                        '{}/{}'.format(score_ret['cds'],
                                       score['pattern']['cds'] if 'cds' in score['pattern'] else ''),
                        '{}/{}'.format(score_ret['nes'],
                                       score['pattern']['nes'] if 'nes' in score['pattern'] else ''),
                        anns[i]['text'].replace('\n', '').replace('\t', '')))
        else:
            if score['sid'] in hts:
                sentence_level_details.append(
                    u'[{}]\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                        score['sid'],
                        'H' if score['sid'] in hts else '-',
                        '-', '-', '-', '-', '-', '-',
                        anns[i]['text'].replace('\n', '').replace('\t', '')))
    prediction = sort_by_threshold(prediction, threshold,
                                   cmp=lambda p1, p2: 1 if p2[1] > p1[1] else 0 if p2[1] == p1[1] else -1)
    for s in prediction:
        if s[0] in hts:
            num_correct += 1
    container.append({
        'paper': scores[0]['doc_id'],
        'predicted': len(prediction),
        'correct': num_correct,
        'hts': len(hts),
        'max_sid': max_sid,
        'highlights': prediction
    })
    return sentence_level_details
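# Worked example of the scoring combination in score_paper_threshold
# (illustrative component scores, not taken from real data):
#   s_sp = 0.6, score_ret['cds'] = 0.3, score_ret['nes'] = 0.5, region r = 0
#   F1 empirical mix:   s = 0.35*0.6 + 0.2*0.3 + 0.45*0.5          = 0.495
#   F2 voting:          voted = 1 (nes) + 1 (cds) + 0.18 (sp)      = 2.18
#                       s *= 2.18 / 2.18                           = 0.495
#   F3 regional boost:  type_boost = 0.3 ** 1.2 ~= 0.236 (default for r in [0, 1])
#                       s *= 0.236 * 10                            ~= 1.167
# The sentence is then counted as a predicted highlight iff s > threshold.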
def remove_ann_sentences(ann_file):
    anns = utils.load_json_data(ann_file)
    for ann in anns:
        ann['text'] = ''
    utils.save_json_array(anns, ann_file)
def get_sub_pred():
    if HighLighter.sub_pred is None:
        HighLighter.sub_pred = utils.load_json_data('./resources/sub_pred.txt')
    return HighLighter.sub_pred
def get_sub_pred_ne_stat():
    if HighLighter.sub_pred_ne_stat is None:
        HighLighter.sub_pred_ne_stat = utils.load_json_data('./resources/sub_pred_ne_stat.json')
    return HighLighter.sub_pred_ne_stat
def get_onto_name_scores():
    if HighLighter.onto_name_scores is None:
        HighLighter.onto_name_scores = utils.load_json_data('./resources/score_ncbo_ontos.json')
    return HighLighter.onto_name_scores
def get_sent_type_boost():
    if HighLighter.sent_type_boost is None:
        HighLighter.sent_type_boost = utils.load_json_data('./resources/sent_type_region_boost.json')
    return HighLighter.sent_type_boost
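# The four getters above share one lazy-loading pattern: a HighLighter class
# attribute caches each parsed resource so its JSON file is read at most once
# per process. A generic helper along these lines could fold them together
# (hypothetical sketch, not part of the existing HighLighter API):
_resource_cache = {}

def _load_resource_once(path):
    # parse the resource on first access, then serve the cached object
    if path not in _resource_cache:
        _resource_cache[path] = utils.load_json_data(path)
    return _resource_cache[path]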
def get_manual_checked_result():
    return utils.load_json_data(manual_file)
def get_language_pattern_stats(score_file, container, out_file, hter):
    scores = utils.load_json_data(score_file)
    max_sid = int(scores[len(scores) - 1]['sid'])
    units = 5
    offset = int(1.0 * max_sid / units)
    anns = utils.load_json_data(scores[0]['doc_id'])
    b_marked = False
    ranges = []
    r = {'sids': [], 's': 0, 'seq': 0}
    ranges.append(r)
    for i in range(len(anns)):
        if (i + 1) % offset == 0:
            r['e'] = i - 1
            r = {'sids': [], 's': i, 'seq': (i + 1) / offset}
            ranges.append(r)
        ann = anns[i]
        if 'marked' in ann:
            b_marked = True
            r['sids'].append(ann['sid'])
    r['e'] = len(anns) - 1
    if not b_marked:
        return
    for r in ranges:
        sids = r['sids']
        stats = get_stats_obj()
        stats['s_nm'] = r['e'] - r['s'] - len(sids)
        stats['s_ht'] = len(sids)
        for i in range(r['s'], r['e']):
            s = scores[i]
            sent_type = 'ht' if s['sid'] in sids else 'nm'
            stat = stats[sent_type]['sp']
            all_sp_types = []
            cat = hter.get_sp_type(s, all_types=all_sp_types)
            if len(all_sp_types) > 0:
                t = '-'.join(sorted(all_sp_types))
                stat[t] = 1 if t not in stat else 1 + stat[t]
            else:
                # count not typed as well
                stat[cat] = 1 if cat not in stat else 1 + stat[cat]
            p = s['pattern']
            nes = sorted(list(set([k for k in p['nes']])))
            cds = sorted(list(set([k for k in p['cds']])))
            if len(all_sp_types) > 0:
                sp = '-'.join(p['sub'] if p['sub'] is not None else '') + ' ' + \
                     '-'.join(p['pred'] if p['pred'] is not None else '')
                stat = stats[sent_type]['sp_breakdown']
                stat[sp] = 1 if sp not in stat else 1 + stat[sp]
            stat = stats[sent_type]['ne']
            for ptn in nes:
                if ptn in hter.get_named_entities():
                    stat[ptn] = 1 if ptn not in stat else 1 + stat[ptn]
            stat = stats[sent_type]['cd']
            for ptn in cds:
                if ptn in hter.get_cardinal_nouns():
                    stat[ptn] = 1 if ptn not in stat else 1 + stat[ptn]
        container.append({'r%s' % r['seq']: stats})
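# Example of the regional split above (illustrative): with max_sid = 100 and
# units = 5, offset = 20, so the paper is cut into consecutive ~20-sentence
# regions and each region's ht/nm pattern counts are appended under keys
# 'r0', 'r1', ... — the same region labels consumed by
# compute_sp_type_regioned_weights and score_paper_threshold.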