def read_cwms_sentences(text_dict, read=True, block_size=20):
    """Read specific sentences with CWMS.

    The sentences of each document are grouped into blocks of at most
    ``block_size`` sentences. For each block, an EKB file name under the
    ``cwms/`` folder is derived from the document identifier (and the block
    index if there is more than one block). If that file exists, its content
    is processed with ``cwms.process_ekb``; otherwise, if ``read`` is True,
    the block text is processed with ``cwms.process_text`` with the file name
    passed as ``save_xml``.

    Parameters
    ----------
    text_dict : dict
        Maps a document identifier to a sequence of sentence strings.
    read : bool
        If False, blocks that have no existing EKB file are skipped instead
        of being sent to ``cwms.process_text``.
    block_size : int
        Maximum number of sentences per block. Must be positive.

    Returns
    -------
    list
        The statements of all processed blocks. The ``pmid`` of the first
        evidence of each statement is set to the document identifier.

    Raises
    ------
    ValueError
        If ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError('block_size must be a positive integer')

    stmts = []
    for doc, sentences in text_dict.items():
        blocks = [sentences[start:start + block_size]
                  for start in range(0, len(sentences), block_size)]
        for j, block in enumerate(blocks):
            # Only multi-block documents get the block index in the name
            if len(blocks) == 1:
                ekb_fname = 'cwms/%s_sentences.ekb' % doc
            else:
                ekb_fname = 'cwms/%s_sentences_%d.ekb' % (doc, j)

            if os.path.exists(ekb_fname):
                with open(ekb_fname, 'r') as fh:
                    cp = cwms.process_ekb(fh.read())
            elif read:
                # The text is only assembled when it is actually sent for
                # reading. Note that str.capitalize also lowercases the
                # rest of each sentence.
                block_txt = '.\n'.join(t.capitalize() for t in block)
                block_txt = preprocess_cwms(block_txt)
                print('Reading into %s' % ekb_fname)
                cp = cwms.process_text(block_txt, save_xml=ekb_fname)
            else:
                continue

            stmts += cp.statements
            # Set the PMID on these statements so that we can get the
            # document ID during assembly
            for stmt in cp.statements:
                stmt.evidence[0].pmid = doc
    return stmts
def read_cwms_ekbs(docnames):
    """Process the existing sentence EKB files of the given documents.

    For each document name, every file matching the pattern
    ``cwms/<docname>_sentences*.ekb`` is read and its content is processed
    with ``cwms.process_ekb``.

    Parameters
    ----------
    docnames : iterable of str
        Document names used to build the file name pattern.

    Returns
    -------
    list
        The statements of all processed EKB files, grouped by document in
        the order of ``docnames`` and, within a document, in sorted file
        name order.
    """
    # NOTE(review): unlike read_cwms_sentences, the pmid of the statements'
    # evidence is not set here - confirm whether callers rely on that.
    stmts = []
    for docname in docnames:
        # glob.glob returns matches in arbitrary order; sorting makes the
        # order of the returned statements reproducible. The sort is
        # lexicographic, so e.g. "_10" sorts before "_2".
        for fname in sorted(glob.glob('cwms/%s_sentences*.ekb' % docname)):
            with open(fname, 'r') as fh:
                cp = cwms.process_ekb(fh.read())
            stmts += cp.statements
    return stmts
def read_cwms_full(fnames, read=True, block_size=20):
    """Read full texts with CWMS.

    Each file is read, passed through ``preprocess_cwms`` and split into
    paragraphs. Each paragraph is split into sentences with
    ``tokenize.sent_tokenize`` and the sentences are grouped into blocks of
    at most ``block_size`` sentences. For each block, an EKB file name is
    derived from the input file name without its extension, the paragraph
    index and (if the paragraph has more than one block) the block index.
    If that file exists, its content is processed with
    ``cwms.process_ekb``; otherwise, if ``read`` is True, the block text is
    processed with ``cwms.process_text`` with the file name passed as
    ``save_xml``.

    Parameters
    ----------
    fnames : iterable of str
        Paths of the text files to read.
    read : bool
        If False, blocks that have no existing EKB file are skipped instead
        of being sent to ``cwms.process_text``.
    block_size : int
        Maximum number of sentences per block. Must be positive.

    Returns
    -------
    list
        The statements of all processed blocks.

    Raises
    ------
    ValueError
        If ``block_size`` is not positive.
    """
    def get_paragraphs(txt):
        # Break up along blank lines
        parts = txt.split('\n\n')
        # Consider a part a paragraph if it's at least 3 lines long
        return [p for p in parts if len(p.split('\n')) >= 3]

    if block_size <= 0:
        raise ValueError('block_size must be a positive integer')

    stmts = []
    for fname in fnames:
        # splitext only strips an extension of the last path component, so
        # names without an extension (or with dots only in a directory
        # name) keep their full path instead of being emptied or truncated.
        basename = os.path.splitext(fname)[0]
        print(basename)
        with open(fname, 'r') as fh:
            print('Reading %s' % fname)
            txt = fh.read()
        txt = preprocess_cwms(txt)
        # Get paragraphs
        paras = get_paragraphs(txt)
        print('Reading %d paragraphs' % len(paras))
        for i, para in enumerate(paras):
            sentences = tokenize.sent_tokenize(para)
            blocks = [sentences[start:start + block_size]
                      for start in range(0, len(sentences), block_size)]
            print('Reading %d blocks' % len(blocks))
            for j, block in enumerate(blocks):
                # Only multi-block paragraphs get the block index in the name
                if len(blocks) == 1:
                    ekb_fname = basename + '_%d.ekb' % i
                else:
                    ekb_fname = basename + '_%d_%d.ekb' % (i, j)

                if os.path.exists(ekb_fname):
                    with open(ekb_fname, 'r') as ekb_fh:
                        cp = cwms.process_ekb(ekb_fh.read())
                elif read:
                    block_txt = ' '.join(block)
                    cp = cwms.process_text(block_txt, save_xml=ekb_fname)
                else:
                    continue

                print('%d stmts from %s (%d/%d)' %
                      (len(cp.statements), fname, i, j))
                stmts += cp.statements
    return stmts