예제 #1
0
def read_cwms_sentences(text_dict, read=True):
    """Read selected sentences of each document with CWMS.

    The sentences of every document are grouped into chunks of at most 20.
    Each chunk is capitalized sentence by sentence, joined with '.\\n' and
    passed through ``preprocess_cwms``. The EKB for a chunk is expected at
    ``cwms/<doc>_sentences.ekb`` (single chunk) or
    ``cwms/<doc>_sentences_<n>.ekb`` (several chunks); if that file exists
    it is processed instead of reading the text again.

    Parameters
    ----------
    text_dict : dict
        Maps a document identifier to a sequence of sentence strings.
    read : bool
        If True (default), chunks without an existing EKB file are passed to
        ``cwms.process_text`` with the EKB path as ``save_xml``. If False,
        such chunks are skipped.

    Returns
    -------
    list
        The statements of all processed chunks, in processing order.
    """
    chunk_size = 20
    all_stmts = []
    for doc_id, doc_sentences in text_dict.items():
        # Consecutive slices of at most chunk_size sentences
        chunks = [
            doc_sentences[start:start + chunk_size]
            for start in range(0, len(doc_sentences), chunk_size)
        ]
        single_chunk = len(chunks) == 1
        for idx, chunk in enumerate(chunks):
            capitalized = [sentence.capitalize() for sentence in chunk]
            chunk_txt = preprocess_cwms('.\n'.join(capitalized))
            # Only number the EKB file when the document has several chunks
            if single_chunk:
                ekb_path = 'cwms/%s_sentences.ekb' % doc_id
            else:
                ekb_path = 'cwms/%s_sentences_%d.ekb' % (doc_id, idx)
            if os.path.exists(ekb_path):
                # An EKB already exists for this chunk: process it directly
                with open(ekb_path, 'r') as ekb_file:
                    processor = cwms.process_ekb(ekb_file.read())
            elif not read:
                continue
            else:
                print('Reading into %s' % ekb_path)
                processor = cwms.process_text(chunk_txt, save_xml=ekb_path)
            all_stmts += processor.statements
            # Set the PMID on these statements so that we can get the
            # document ID during assembly
            for stmt in processor.statements:
                stmt.evidence[0].pmid = doc_id
    return all_stmts
예제 #2
0
def read_cwms_ekbs(docnames):
    """Collect statements from the existing sentence EKB files of documents.

    For every name in ``docnames``, each file matching
    ``cwms/<docname>_sentences*.ekb`` is processed with
    ``cwms.process_ekb`` and its statements are appended to the result.

    Parameters
    ----------
    docnames : iterable of str
        Document names used to build the glob pattern.

    Returns
    -------
    list
        The statements from all matching EKB files, in the order the files
        are returned by ``glob.glob``.
    """
    collected = []
    for name in docnames:
        pattern = 'cwms/%s_sentences*.ekb' % name
        for ekb_path in glob.glob(pattern):
            with open(ekb_path, 'r') as ekb_file:
                ekb_xml = ekb_file.read()
            processor = cwms.process_ekb(ekb_xml)
            collected.extend(processor.statements)
    return collected
예제 #3
0
def read_cwms_full(fnames, read=True):
    """Read full texts with CWMS.

    Each file is passed through ``preprocess_cwms``, split into paragraphs,
    and every paragraph is sentence-tokenized and grouped into blocks of at
    most 20 sentences. The EKB for a block is expected next to the input
    file as ``<base>_<paragraph>.ekb`` (single block) or
    ``<base>_<paragraph>_<block>.ekb`` (several blocks), where ``<base>`` is
    the input path without its extension; if that file exists it is
    processed instead of reading the text again.

    Parameters
    ----------
    fnames : iterable of str
        Paths of the text files to read.
    read : bool
        If True (default), blocks without an existing EKB file are passed to
        ``cwms.process_text`` with the EKB path as ``save_xml``. If False,
        such blocks are skipped.

    Returns
    -------
    list
        The statements of all processed blocks, in processing order.
    """
    def get_paragraphs(txt):
        # Break up along blank lines
        parts = txt.split('\n\n')
        # Consider a part a paragraph if it's at least 3 lines long
        return [p for p in parts if len(p.split('\n')) >= 3]

    # Maximum number of sentences per block
    bl = 20
    stmts = []
    for fname in fnames:
        # Strip only the final extension. Splitting on every '.' yields an
        # empty or truncated base for paths without an extension or with
        # dots in a directory name (e.g. '../data/foo'), which made the EKB
        # file names of unrelated inputs collide.
        basename = os.path.splitext(fname)[0]
        print(basename)
        with open(fname, 'r') as fh:
            print('Reading %s' % fname)
            txt = fh.read()
        txt = preprocess_cwms(txt)
        # Get paragraphs
        paras = get_paragraphs(txt)
        print('Reading %d paragraphs' % len(paras))
        for i, para in enumerate(paras):
            sentences = tokenize.sent_tokenize(para)
            # Use a distinct index name so the paragraph index `i` is not
            # shadowed inside the comprehension
            blocks = [
                sentences[k * bl:k * bl + bl]
                for k in range(math.ceil(len(sentences) / bl))
            ]
            print('Reading %d blocks' % len(blocks))
            for j, block in enumerate(blocks):
                block_txt = ' '.join(block)
                # Only add the block index when the paragraph has several
                if len(blocks) == 1:
                    ekb_fname = basename + '_%d.ekb' % i
                else:
                    ekb_fname = basename + '_%d_%d.ekb' % (i, j)
                if os.path.exists(ekb_fname):
                    # An EKB already exists for this block: process it
                    with open(ekb_fname, 'r') as ekb_fh:
                        cp = cwms.process_ekb(ekb_fh.read())
                elif read:
                    cp = cwms.process_text(block_txt, save_xml=ekb_fname)
                else:
                    continue
                print('%d stmts from %s (%d/%d)' %
                      (len(cp.statements), fname, i, j))
                stmts += cp.statements
    return stmts