Exemplo n.º 1
0
def load_hume(cached=True):
    """Return Hume influence statements, reading a cached pickle if present.

    Parameters
    ----------
    cached : bool
        If True and the pickle exists, load statements from it instead of
        reprocessing the JSON-LD files. Freshly processed statements are
        always written back to the pickle.

    Returns
    -------
    list
        INDRA Statements extracted from the Hume JSON-LD output.
    """
    logger.info('Loading Hume statements')
    pkl_name = os.path.join(data_path, 'hume', 'stmts_influence.pkl')
    # Fast path: reuse the previously pickled statements.
    if cached and os.path.exists(pkl_name):
        with open(pkl_name, 'rb') as fh:
            stmts = pickle.load(fh)
            logger.info(f'Loaded {len(stmts)} statements')
            return stmts
    # Gather JSON-LD files from the three Hume output batches, in order.
    subdirs = ('wm_dart.101119.121619',
               'wm_factiva.121019.121619',
               'wm_luma.121019.121619')
    fnames = []
    for subdir in subdirs:
        fnames += glob.glob(
            os.path.join(data_path, 'hume', subdir, '*.json-ld'))

    stmts = []
    for fname in tqdm.tqdm(fnames):
        stmts += hume.process_jsonld_file(fname).statements
    logger.info(f'Loaded {len(stmts)} statements from Hume')
    # Refresh the cache for subsequent runs.
    with open(pkl_name, 'wb') as fh:
        pickle.dump(stmts, fh)
    return stmts
Exemplo n.º 2
0
def load_hume():
    """Return INDRA Statements from the single Hume DART JSON-LD dump."""
    logger.info('Loading Hume statements')
    pattern = os.path.join(data_path, 'hume/wm_dart.082919.v3.json-ld')

    stmts = []
    # glob is used even though the pattern names a single file; it simply
    # yields nothing when the file is absent.
    for fname in glob.glob(pattern):
        stmts += hume.process_jsonld_file(fname).statements
    logger.info(f'Loaded {len(stmts)} statements from Hume')
    return stmts
Exemplo n.º 3
0
def read_hume(fname, version='new'):
    """Process one Hume output file and return its filtered statements.

    Parameters
    ----------
    fname : str
        Path to the Hume output file.
    version : str
        'new' selects the JSON-LD processor; any other value falls back to
        the legacy JSON processor.

    Returns
    -------
    list
        Statements with those from the duplicate document
        'ENG_NW_20171205' removed (avoids artifactual duplicates).
    """
    if version == 'new':
        bp = hume.process_jsonld_file(fname)
        # If original doc names ever need to be reconstructed, the evidence
        # pmid can be remapped here by stripping its last four characters.
    else:
        bp = hume.process_json_file_old(fname)
    # Keep only statements not sourced from the known duplicate document.
    return [
        stmt for stmt in bp.statements
        if stmt.evidence[0].annotations['provenance'][0]['document']['@id']
        != 'ENG_NW_20171205'
    ]
Exemplo n.º 4
0
def load_hume(cached=True):
    """Return Hume intervention statements, reading a cached pickle if present.

    Parameters
    ----------
    cached : bool
        If True and the pickle exists, load statements from it instead of
        reprocessing the JSON-LD files. Freshly processed statements are
        always written back to the pickle.

    Returns
    -------
    list
        INDRA Statements extracted from the Hume JSON-LD output.
    """
    logger.info('Loading Hume statements')
    pkl_name = os.path.join(data_path, 'hume', 'stmts_intervention.pkl')
    # Fast path: reuse the previously pickled statements.
    if cached and os.path.exists(pkl_name):
        with open(pkl_name, 'rb') as fh:
            stmts = pickle.load(fh)
            logger.info(f'Loaded {len(stmts)} statements')
            return stmts
    pattern = os.path.join(data_path, 'hume',
                           'wm_thanksgiving_intervention.030920', '*.json-ld')
    stmts = []
    for fname in tqdm.tqdm(glob.glob(pattern)):
        stmts += hume.process_jsonld_file(fname).statements
    logger.info(f'Loaded {len(stmts)} statements from Hume')
    # Refresh the cache for subsequent runs.
    with open(pkl_name, 'wb') as fh:
        pickle.dump(stmts, fh)
    return stmts
Exemplo n.º 5
0
def process_hume():
    """Process the hard-coded Hume JSON-LD dump and return its statements."""
    print('Processing Hume output')
    # Earlier runs used 'docs/hume/wm_m12.v8.full.v4.json-ld'.
    hp = hume.process_jsonld_file('docs/hume/wm_m12.v11.500doc.after.json-ld')
    return hp.statements
Exemplo n.º 6
0
 # Process output files from each of the four World Modelers readers and
 # normalize evidence provenance. NOTE(review): this snippet appears to be
 # truncated by the scraper — the `else` branch below assigns `prov` and then
 # cuts off, so the continuation is not visible here.
 readers = ['sofia', 'eidos', 'hume', 'cwms']
 grounding = 'compositional'
 do_upload = False
 stmts = []
 for reader in readers:
     # Look up the output version for this reader/grounding combination;
     # reader_versions is defined elsewhere in the file.
     version = reader_versions[grounding][reader]
     # Sofia output files are suffixed by grounding mode; other readers
     # match any file name.
     pattern = '*' if reader != 'sofia' \
         else ('*_new' if grounding == 'compositional' else '*_old')
     fnames = glob.glob('/Users/ben/data/dart/%s/%s/%s' %
                        (reader, version, pattern))
     print('Found %d files for %s' % (len(fnames), reader))
     for fname in tqdm.tqdm(fnames):
         # Dispatch to the reader-specific INDRA processor.
         if reader == 'eidos':
             pp = eidos.process_json_file(fname, grounding_mode=grounding)
         elif reader == 'hume':
             pp = hume.process_jsonld_file(fname, grounding_mode=grounding)
         elif reader == 'cwms':
             pp = cwms.process_ekb_file(fname, grounding_mode=grounding)
         elif reader == 'sofia':
             pp = sofia.process_json_file(fname, grounding_mode=grounding)
         # The first 32 characters of the file name are taken as the DART
         # document ID — presumably a fixed-width hash prefix; TODO confirm.
         doc_id = os.path.basename(fname)[:32]
         for stmt in pp.statements:
             for ev in stmt.evidence:
                 # Ensure every evidence carries provenance pointing at the
                 # source document.
                 if 'provenance' not in ev.annotations:
                     ev.annotations['provenance'] = [{
                         'document': {
                             '@id': doc_id
                         }
                     }]
                 else:
                     prov = ev.annotations['provenance'][0]['document']
Exemplo n.º 7
0
def do_regrounding(stmts):
    """Reground all concept texts in the given statements.

    Collects the 'TEXT' db_refs entry from every concept, sends them in one
    batch to the Eidos regrounding service, and writes the returned
    groundings back under the 'UN' key. Concepts are mutated in place.

    Parameters
    ----------
    stmts : list
        Statements whose concepts should be regrounded.

    Returns
    -------
    list
        The same (mutated) list of statements.
    """
    texts = [concept.db_refs.get('TEXT')
             for stmt in stmts
             for concept in stmt.agent_list()]
    groundings = er.reground_texts(texts)
    # Walk the concepts in the same order the texts were collected so that
    # each grounding lines up with its concept.
    pos = 0
    for stmt in stmts:
        for concept in stmt.agent_list():
            concept.db_refs['UN'] = groundings[pos]
            pos += 1
    return stmts


if __name__ == '__main__':
    # Reground each configured Hume JSON-LD file and save it as JSON.
    config = load_config()
    fnames = config['files']
    for fname in fnames:
        print('Processing %s' % fname)
        hp = hume.process_jsonld_file(fname)
        # Output name: '<parent_dir>_<basename>' with the json-ld extension
        # mapped to json.
        parts = fname.split('/')
        new_fname = '%s_%s' % (parts[-2], parts[-1])
        new_fname = new_fname.replace('json-ld', 'json')
        print('Running regrounding')
        stmts = do_regrounding(hp.statements)
        # Fix: save the regrounded statements explicitly (the original saved
        # hp.statements, which only worked because do_regrounding mutates in
        # place), and correct the 'Savig' typo in the progress message.
        print('Saving into JSON')
        stmts_to_json_file(stmts, new_fname)