示例#1
0
def test_lexmap_basic():
    """
    Test lexical mapping.

    Runs in two phases:

    1. Index the ontology loaded from ``tests/resources/lexmap_test.json``
       with a default ``LexicalMapEngine`` and check the expected
       cross-reference edges and that every edge's conditional
       probability lies in (0, 1].
    2. Build a second engine with a custom configuration (synsets,
       confidences, a per-prefix ontology configuration), add further
       nodes to the same ontology, re-index, and check the resulting
       edges and scores.
    """
    factory = OntologyFactory()
    print("Creating ont")
    ont = factory.create('tests/resources/lexmap_test.json')
    lexmap = LexicalMapEngine()
    lexmap.index_ontology(ont)

    print(lexmap.lmap)
    print(ont.all_synonyms())
    g = lexmap.get_xref_graph()
    # NOTE: edges(data=True) rather than edges_iter(data=True); the latter
    # exists only in NetworkX 1.x, whereas edges() works on 1.x and 2.x.
    for x, y, d in g.edges(data=True):
        print("{}<->{} :: {}".format(x, y, d))
    for x in g.nodes():
        print("{} --> {}".format(x, lexmap.grouped_mappings(x)))
    assert g.has_edge('Z:2', 'ZZ:2')  # roman numerals
    assert g.has_edge('Z:2', 'Y:2')  # case insensitivity
    assert g.has_edge('A:1', 'B:1')  # synonyms
    assert g.has_edge('B:1', 'A:1')  # bidirectional
    for x, y, d in g.edges(data=True):
        print("{}<->{} :: {}".format(x, y, d))
        cpr = d[lexmap.CONDITIONAL_PR]
        # every edge must carry a conditional probability in (0, 1]
        assert 0 < cpr <= 1.0

    df = lexmap.as_dataframe(g)
    print(df.to_csv(sep="\t"))

    # Phase 2: a fresh engine with an explicit configuration.
    lexmap = LexicalMapEngine(
        config=dict(synsets=[dict(word="", synonym="ignoreme", weight=-2.0)],
                    normalized_form_confidence=0.25,
                    abbreviation_confidence=0.5,
                    meaningful_ids=True,
                    ontology_configurations=[
                        dict(prefix='AA', normalized_form_confidence=-1000)
                    ]))

    assert len(lexmap._get_config_val('NULL', 'synsets')) == 1
    assert lexmap._normalize_label('ignoreme foo', {'ignoreme': ''}) == 'foo'
    assert lexmap._normalize_label('replaceme foo',
                                   {'replaceme': 'zz'}) == 'foo zz'

    # Extra nodes exercising word order, synonyms, IRIs used as IDs,
    # purely numeric labels, the custom synset and abbreviations.
    ont.add_node('TEST:1', 'foo bar')
    ont.add_node('TEST:2', 'bar foo')
    ont.add_node('TEST:3', 'foo bar')
    ont.add_node('TEST:4', 'wiz')
    syn = Synonym('TEST:4', val='bar foo', pred='hasRelatedSynonym')
    ont.add_synonym(syn)
    ont.add_node('http://x.org/wiz#FooBar')
    ont.add_node('TEST:6', '123')
    ont.add_node('TEST:7', '123')
    ont.add_node('TEST:8', 'bar ignoreme foo')
    ont.add_node('AA:1', 'foo bar')
    ont.add_node('AA:2', 'bar foo')
    ont.add_node('ABBREV:1', 'ABCD')
    ont.add_node('ABBREV:2', 'ABCD')
    for s in ont.synonyms('TEST:4'):
        print('S={}'.format(s))
    lexmap.index_ontology(ont)
    g = lexmap.get_xref_graph()
    for x, d in g['TEST:1'].items():
        print('XREF: {} = {}'.format(x, d))
    assert g.has_edge('TEST:1', 'TEST:2')  # normalized
    logging.info('E 1-2 = {}'.format(g['TEST:1']['TEST:2']))
    # presumably 25 reflects normalized_form_confidence=0.25 set above
    assert int(g['TEST:1']['TEST:2']['score']) == 25
    assert int(g['TEST:1']['TEST:3']['score']) == 100
    assert int(g['TEST:1']['TEST:4']['score']) < 25
    assert g.has_edge('TEST:3', 'http://x.org/wiz#FooBar')  # IDs and CamelCase
    assert not g.has_edge('TEST:6',
                          'TEST:7')  # should omit syns with no alphanumeric

    # test exclude normalized form
    assert not g.has_edge('AA:1', 'AA:2')

    # test custom synsets are used
    assert g.has_edge('TEST:8', 'TEST:2')
    assert g.has_edge('TEST:8', 'AA:2')
    assert not g.has_edge('TEST:8', 'AA:1')  # do not normalize AAs

    assert lexmap.smap['ABBREV:1'][0].is_abbreviation()
    assert lexmap.smap['ABBREV:2'][0].is_abbreviation()
    assert g.has_edge('ABBREV:1', 'ABBREV:2')
    assert int(g['ABBREV:1']['ABBREV:2']['score']) == 25

    df = lexmap.unmapped_dataframe(g)
    print(df.to_csv())
示例#2
0
def main():
    """
    Command-line entry point for ontobio lexical mapping.

    Parses the command line, loads every ontology handle given, indexes
    them in a ``LexicalMapEngine``, computes the cross-reference graph and
    writes it out (OBO when ``--to obo`` is given, TSV otherwise).
    Optionally exports the unmapped nodes to the file named by
    ``--unmapped``.

    Raises:
        ValueError: if no ontology handle is passed on the command line.
    """
    parser = argparse.ArgumentParser(description='Wrapper for ontobio lexical mapping'
                                                 """
                                                 Lexically maps one or more ontologies. Ontologies can be local or remote,
                                                 any input handle can be specified, see docs for more details on handles.

                                                 If multiple ontologies are specified, then each ontology in the list is compared against the first one.

                                                 If a simgle ontology is specified, then all pairs in that ontology will be compared
                                                 
                                                 Output format to be documented - see lexmap.py for the various scoring attributes for now.
                                                 """,
                                     formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-o', '--outfile', type=str, nargs='*', default=[], required=False,
                        help='Path to output file')
    parser.add_argument('-t', '--to', type=str, required=False, default='tsv',
                        help='Output to (tree, dot, ...)')
    parser.add_argument('-l', '--labels', type=str,
                        help='If set, then include node labels in results. DEPRECATED')
    parser.add_argument('-s', '--scoring', default='sim', type=str,
                        help='Score weighting scheme. Default=sim')
    parser.add_argument('-P', '--prefix', type=str, required=False,
                        help='Prefix to constrain traversal on, e.g. PATO, ENVO')
    parser.add_argument('-c', '--config', type=str, required=False,
                        help='lexmap configuration file (yaml). See schema for details')
    parser.add_argument('-X', '--xref_weights', type=str, required=False,
                        help='csv of curated per-xref weights')
    parser.add_argument('-u', '--unmapped', type=str, required=False,
                        help='File to export unmapped nodes to')
    parser.add_argument('-A', '--all-by-all', dest='all_by_all', action='store_true',
                        help='compare all ontologies against all.')
    parser.add_argument('-v', '--verbosity', default=0, action='count',
                        help='Increase output verbosity')

    parser.add_argument('ontologies',nargs='*',
                        help='one or more ontologies to be aligned. Any input handle can be specified')

    args = parser.parse_args()

    # -v => INFO, -vv (or more) => DEBUG, otherwise WARNING
    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARNING)

    logging.info("Welcome!")

    factory = OntologyFactory()
    onts = [filter_by_prefix(factory.create(h)) for h in args.ontologies]

    config = {}
    if args.config is not None:
        # safe_load: the config is plain data, and yaml.load() without an
        # explicit Loader is unsafe (and rejected by recent PyYAML).
        with open(args.config, 'r') as f:
            # an empty YAML document parses to None; fall back to {}
            config = yaml.safe_load(f) or {}

    if args.xref_weights is not None:
        xws = config.setdefault('xref_weights', [])
        df = pd.read_csv(args.xref_weights)
        df = df.fillna(0.0)
        for _, row in df.iterrows():
            w = float(row['weight'])
            # Only slots 2 and 3 are populated: slot 2 gets the curated
            # weight and slot 3 its negation (for w < 0, -w == abs(w)).
            WA = np.array((0.0, 0.0, w, -w))
            xws.append({'left':row['left'],
                        'right':row['right'],
                        'weights':WA})

    logging.info("ALL: {}".format(args.all_by_all))

    lexmap = LexicalMapEngine(config=config)
    if not onts:
        raise ValueError("must pass one or more ontologies")

    logging.info("Indexing ontologies: {}".format(onts))
    for ont in onts:
        lexmap.index_ontology(ont)

    # Default for a single ontology: compare it against itself.
    oid0 = onts[0].id
    pairs = [(oid0,oid0)]
    if len(onts) > 1:
        if args.all_by_all:
            logging.info("All vs ALL: {}".format(onts))
            # i.id < j.id keeps each unordered pair once and drops self-pairs
            pairs = [(i.id, j.id) for i in onts for j in onts if i.id < j.id]
        else:
            logging.info("All vs first in list: {}".format(oid0))
            pairs = [(oid0, ont.id) for ont in onts[1:]]
    logging.info("Comparing the following pairs of ontologies: {}".format(pairs))
    lexmap.ontology_pairs = pairs

    # Merged ontology handed to the writers alongside the xref graph.
    mo = Ontology()
    mo.merge(onts)

    g = lexmap.get_xref_graph()

    if args.to == 'obo':
        write_obo(g,mo,args)
    else:
        write_tsv(lexmap,g,mo,args)

    if args.unmapped:
        udf = lexmap.unmapped_dataframe(g)
        udf.to_csv(args.unmapped, sep="\t", index=False)