def add_features_generic(mention_id, pheno_words, sentence):
    """Emit the generic ddlib features for one phenotype mention.

    Registers the phenotype keyword dictionaries with ddlib, converts the
    sentence into ddlib word objects and calls ``print_feature`` once for
    every distinct feature produced by
    ``ddlib.get_generic_features_mention``.

    Args:
        mention_id: identifier forwarded unchanged to ``print_feature``.
        pheno_words: words of the mention; only the first word's
            ``in_sent_idx`` and the number of words are used.
        sentence: object exposing ``doc_id`` and ``words``; every word must
            provide ``lemma``, ``word``, ``ner``, ``pos``, ``dep_parent``,
            ``dep_path`` and ``in_sent_idx``.
    """
    # (path suffix, dictionary name) pairs, registered in this order.
    keyword_dicts = (
        ("/dicts/features/pheno_var.tsv", "VARKW"),
        ("/dicts/features/pheno_patient.tsv", "PATIENTKW"),
    )
    for suffix, dict_name in keyword_dicts:
        ddlib.load_dictionary(BASE_DIR + suffix, dict_name)

    # Column-oriented view of the sentence; the keys are the names handed
    # to unpack_words below.
    columns = {'lemma': [], 'words': [], 'ner': [], 'pos': [],
               'dep_graph': []}
    for token in sentence.words:
        columns['lemma'].append(token.lemma)
        columns['words'].append(token.word)
        columns['ner'].append(token.ner)
        columns['pos'].append(token.pos)
        # One "parent<TAB>label<TAB>child" edge per token; both indices
        # are incremented by one before being written.
        columns['dep_graph'].append("\t".join([
            str(token.dep_parent + 1),
            token.dep_path,
            str(token.in_sent_idx + 1),
        ]))

    word_obj_list = ddlib.unpack_words(
        columns,
        lemma='lemma',
        pos='pos',
        ner='ner',
        words='words',
        dep_graph='dep_graph',
        dep_graph_parser=ddlib.dep_graph_parser_triplet)

    # Span built from the first mention word's sentence index and the
    # number of mention words.
    mention_span = ddlib.get_span(pheno_words[0].in_sent_idx,
                                  len(pheno_words))

    # Collect into a set first so each distinct feature is printed once.
    distinct_features = set(
        ddlib.get_generic_features_mention(word_obj_list, mention_span))
    for feature in distinct_features:
        print_feature(sentence.doc_id, mention_id, feature)
Пример #2
0
def run(doc_id, sent_id, words, lemmas, poses, ners, dep_paths, dep_parents, wordidxs, relation_id, wordidxs_1, wordidxs_2):
  """Yield the generic ddlib relation features for one candidate relation.

  Args:
    doc_id: echoed as the first element of every yielded tuple.
    sent_id: not used by this function.
    words, lemmas, poses, ners, dep_paths, dep_parents: parallel per-token
      sequences, indexed by token position. Each ``dep_parents`` entry is
      passed through ``int()``.
    wordidxs: not used by this function.
    relation_id: echoed as the second element of every yielded tuple.
    wordidxs_1: token indices of the first mention; only the first index
      and the length are used.
    wordidxs_2: token indices of the second mention; only the first index
      and the length are used.

  Yields:
    ``(doc_id, relation_id, feature)`` once per distinct feature returned
    by ``ddlib.get_generic_features_relation``.

  Raises:
    KeyError: if ddlib is not importable and ``DEEPDIVE_HOME`` is unset.
    IndexError: if a parallel sequence is shorter than ``words``.
  """
  try:
    import ddlib
  except ImportError:
    # Only a failed import should trigger the fallback; retry with
    # $DEEPDIVE_HOME/ddlib appended to the module search path.
    import os
    from sys import path
    path.append('%s/ddlib' % os.environ['DEEPDIVE_HOME'])
    import ddlib

  # Column-oriented view of the sentence; the keys are the names handed to
  # unpack_words below.
  obj = {'lemma': [], 'words': [], 'ner': [], 'pos': [], 'dep_graph': []}
  # enumerate() instead of xrange(): runs on both Python 2 and Python 3.
  for i, word in enumerate(words):
    obj['lemma'].append(lemmas[i])
    obj['words'].append(word)
    obj['ner'].append(ners[i])
    obj['pos'].append(poses[i])
    # NOTE(review): indices are written as given (no +1 shift) and no
    # dep_graph_parser is passed, so ddlib's default parser is used.
    # Confirm it accepts this tab-separated "parent, label, child" form.
    obj['dep_graph'].append(
        str(int(dep_parents[i])) + "\t" + dep_paths[i] + "\t" + str(i))

  word_obj_list = ddlib.unpack_words(
      obj, lemma='lemma', pos='pos', ner='ner', words='words', dep_graph='dep_graph')
  gene_span = ddlib.get_span(wordidxs_1[0], len(wordidxs_1))
  pheno_span = ddlib.get_span(wordidxs_2[0], len(wordidxs_2))

  # Collect into a set first so each distinct feature is yielded once.
  features = set(
      ddlib.get_generic_features_relation(word_obj_list, gene_span, pheno_span))
  for feature in features:
    yield doc_id, relation_id, feature
def add_features_generic(relation_id, gene_words, pheno_words, sentence):
    """Emit the generic ddlib features for one gene/phenotype relation.

    Converts the sentence into ddlib word objects and calls
    ``print_feature`` once for every distinct feature produced by
    ``ddlib.get_generic_features_relation`` for the two mention spans.

    Args:
        relation_id: identifier forwarded unchanged to ``print_feature``.
        gene_words: words of the gene mention; only the first word's
            ``in_sent_idx`` and the number of words are used.
        pheno_words: words of the phenotype mention; only the first word's
            ``in_sent_idx`` and the number of words are used.
        sentence: object exposing ``doc_id`` and ``words``; every word must
            provide ``lemma``, ``word``, ``ner``, ``pos``, ``dep_parent``,
            ``dep_path`` and ``in_sent_idx``.
    """
    # Column-oriented view of the sentence; the keys are the names handed
    # to unpack_words below.
    obj = {'lemma': [], 'words': [], 'ner': [], 'pos': [], 'dep_graph': []}
    for word in sentence.words:
        obj['lemma'].append(word.lemma)
        obj['words'].append(word.word)
        obj['ner'].append(word.ner)
        obj['pos'].append(word.pos)
        # One "parent<TAB>label<TAB>child" edge per token; both indices
        # are incremented by one before being written.
        obj['dep_graph'].append(
            str(word.dep_parent + 1) + "\t" + word.dep_path + "\t" +
            str(word.in_sent_idx + 1))
    word_obj_list = ddlib.unpack_words(
        obj,
        lemma='lemma',
        pos='pos',
        ner='ner',
        words='words',
        dep_graph='dep_graph',
        dep_graph_parser=ddlib.dep_graph_parser_triplet)
    gene_span = ddlib.get_span(gene_words[0].in_sent_idx, len(gene_words))
    # Fixed attribute typo: was ``ins_sent_idx``; the gene span above reads
    # ``in_sent_idx``.
    pheno_span = ddlib.get_span(pheno_words[0].in_sent_idx, len(pheno_words))
    # Fixed function-name typo: was ``get_generic_feature_relation``; ddlib
    # names it ``get_generic_features_relation``.
    # Collect into a set first so each distinct feature is printed once.
    features = set(
        ddlib.get_generic_features_relation(word_obj_list, gene_span,
                                            pheno_span))
    for feature in features:
        print_feature(sentence.doc_id, relation_id, feature)
def add_features_generic(mention_id, gene_words, sentence):
    """Emit the generic ddlib features for one gene mention.

    Registers the gene keyword dictionaries with ddlib, converts the
    sentence into ddlib word objects and calls ``print_feature`` once for
    every distinct feature produced by
    ``ddlib.get_generic_features_mention``.

    Args:
        mention_id: identifier forwarded unchanged to ``print_feature``.
        gene_words: words of the mention; only the first word's
            ``in_sent_idx`` and the number of words are used.
        sentence: object exposing ``doc_id`` and ``words``; every word must
            provide ``lemma``, ``word``, ``ner``, ``pos``, ``dep_parent``,
            ``dep_path`` and ``in_sent_idx``.
    """
    # (path suffix, dictionary name) pairs, registered in this order.
    keyword_dicts = (
        ("/dicts/features/gene_var.tsv", "VARKW"),
        ("/dicts/features/gene_knock.tsv", "KNOCKKW"),
        ("/dicts/features/gene_amino.tsv", "AMINOKW"),
        ("/dicts/features/gene_antigene.tsv", "ANTIGENEKW"),
        ("/dicts/features/gene_dna.tsv", "DNAKW"),
        ("/dicts/features/gene_downregulation.tsv", "DOWNREGKW"),
        ("/dicts/features/gene_upregulation.tsv", "UPREGKW"),
        ("/dicts/features/gene_tumor.tsv", "TUMORKW"),
        ("/dicts/features/gene_gene.tsv", "GENEKW"),
        ("/dicts/features/gene_expression.tsv", "EXPRESSKW"),
    )
    for suffix, dict_name in keyword_dicts:
        ddlib.load_dictionary(BASE_DIR + suffix, dict_name)

    # Column-oriented view of the sentence; the keys are the names handed
    # to unpack_words below.
    columns = {'lemma': [], 'words': [], 'ner': [], 'pos': [],
               'dep_graph': []}
    for token in sentence.words:
        columns['lemma'].append(token.lemma)
        columns['words'].append(token.word)
        columns['ner'].append(token.ner)
        columns['pos'].append(token.pos)
        # One "parent<TAB>label<TAB>child" edge per token; both indices
        # are incremented by one before being written.
        columns['dep_graph'].append("\t".join([
            str(token.dep_parent + 1),
            token.dep_path,
            str(token.in_sent_idx + 1),
        ]))

    word_obj_list = ddlib.unpack_words(
        columns, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph',
        dep_graph_parser=ddlib.dep_graph_parser_triplet)

    # Span built from the first mention word's sentence index and the
    # number of mention words.
    mention_span = ddlib.get_span(gene_words[0].in_sent_idx,
                                  len(gene_words))

    # Collect into a set first so each distinct feature is printed once.
    distinct_features = set(
        ddlib.get_generic_features_mention(word_obj_list, mention_span))
    for feature in distinct_features:
        print_feature(sentence.doc_id, mention_id, feature)
Пример #5
0
def add_features_generic(mention_id, gene_words, sentence):
    """Print every distinct generic ddlib feature of a gene mention.

    The gene keyword dictionaries are registered with ddlib first; the
    sentence is then converted to ddlib word objects and each distinct
    feature from ``ddlib.get_generic_features_mention`` is passed to
    ``print_feature``.

    Args:
        mention_id: identifier forwarded unchanged to ``print_feature``.
        gene_words: words of the mention; only the first word's
            ``in_sent_idx`` and the number of words are used.
        sentence: object exposing ``doc_id`` and ``words``; every word must
            provide ``lemma``, ``word``, ``ner``, ``pos``, ``dep_parent``,
            ``dep_path`` and ``in_sent_idx``.
    """
    # Dictionary files and the names they are registered under, in load
    # order.
    for suffix, dict_name in (
            ("/dicts/features/gene_var.tsv", "VARKW"),
            ("/dicts/features/gene_knock.tsv", "KNOCKKW"),
            ("/dicts/features/gene_amino.tsv", "AMINOKW"),
            ("/dicts/features/gene_antigene.tsv", "ANTIGENEKW"),
            ("/dicts/features/gene_dna.tsv", "DNAKW"),
            ("/dicts/features/gene_downregulation.tsv", "DOWNREGKW"),
            ("/dicts/features/gene_upregulation.tsv", "UPREGKW"),
            ("/dicts/features/gene_tumor.tsv", "TUMORKW"),
            ("/dicts/features/gene_gene.tsv", "GENEKW"),
            ("/dicts/features/gene_expression.tsv", "EXPRESSKW")):
        ddlib.load_dictionary(BASE_DIR + suffix, dict_name)

    # Per-token columns; the keys are the names handed to unpack_words.
    lemma_col, word_col, ner_col, pos_col, edge_col = [], [], [], [], []
    for token in sentence.words:
        lemma_col.append(token.lemma)
        word_col.append(token.word)
        ner_col.append(token.ner)
        pos_col.append(token.pos)
        # One "parent<TAB>label<TAB>child" edge per token; both indices
        # are incremented by one before being written.
        edge_col.append("\t".join([
            str(token.dep_parent + 1),
            token.dep_path,
            str(token.in_sent_idx + 1),
        ]))
    columns = {
        'lemma': lemma_col,
        'words': word_col,
        'ner': ner_col,
        'pos': pos_col,
        'dep_graph': edge_col,
    }

    word_obj_list = ddlib.unpack_words(
        columns,
        lemma='lemma',
        pos='pos',
        ner='ner',
        words='words',
        dep_graph='dep_graph',
        dep_graph_parser=ddlib.dep_graph_parser_triplet)

    # Span built from the first mention word's sentence index and the
    # number of mention words.
    mention_span = ddlib.get_span(gene_words[0].in_sent_idx,
                                  len(gene_words))

    # Collect into a set first so each distinct feature is printed once.
    for feature in set(ddlib.get_generic_features_mention(
            word_obj_list, mention_span)):
        print_feature(sentence.doc_id, mention_id, feature)
Пример #6
0
def run(doc_id, sent_id, words, lemmas, poses, ners, dep_paths, dep_parents,
        wordidxs, relation_id, wordidxs_1, wordidxs_2):
    """Yield the generic ddlib relation features for one candidate relation.

    Args:
        doc_id: echoed as the first element of every yielded tuple.
        sent_id: not used by this function.
        words, lemmas, poses, ners, dep_paths, dep_parents: parallel
            per-token sequences, indexed by token position. Each
            ``dep_parents`` entry is passed through ``int()``.
        wordidxs: not used by this function.
        relation_id: echoed as the second element of every yielded tuple.
        wordidxs_1: token indices of the first mention; only the first
            index and the length are used.
        wordidxs_2: token indices of the second mention; only the first
            index and the length are used.

    Yields:
        ``(doc_id, relation_id, feature)`` once per distinct feature
        returned by ``ddlib.get_generic_features_relation``.

    Raises:
        KeyError: if ddlib is not importable and ``DEEPDIVE_HOME`` is
            unset.
        IndexError: if a parallel sequence is shorter than ``words``.
    """
    try:
        import ddlib
    except ImportError:
        # Only a failed import should trigger the fallback; retry with
        # $DEEPDIVE_HOME/ddlib appended to the module search path.
        import os
        from sys import path
        path.append('%s/ddlib' % os.environ['DEEPDIVE_HOME'])
        import ddlib

    # Column-oriented view of the sentence; the keys are the names handed
    # to unpack_words below.
    obj = {'lemma': [], 'words': [], 'ner': [], 'pos': [], 'dep_graph': []}
    # enumerate() instead of xrange(): runs on both Python 2 and Python 3.
    for i, word in enumerate(words):
        obj['lemma'].append(lemmas[i])
        obj['words'].append(word)
        obj['ner'].append(ners[i])
        obj['pos'].append(poses[i])
        # NOTE(review): indices are written as given (no +1 shift) and no
        # dep_graph_parser is passed, so ddlib's default parser is used.
        # Confirm it accepts this tab-separated "parent, label, child" form.
        obj['dep_graph'].append(
            str(int(dep_parents[i])) + "\t" + dep_paths[i] + "\t" + str(i))

    word_obj_list = ddlib.unpack_words(obj,
                                       lemma='lemma',
                                       pos='pos',
                                       ner='ner',
                                       words='words',
                                       dep_graph='dep_graph')
    gene_span = ddlib.get_span(wordidxs_1[0], len(wordidxs_1))
    pheno_span = ddlib.get_span(wordidxs_2[0], len(wordidxs_2))

    # Collect into a set first so each distinct feature is yielded once.
    features = set(
        ddlib.get_generic_features_relation(word_obj_list, gene_span,
                                            pheno_span))
    for feature in features:
        yield doc_id, relation_id, feature
Пример #7
0
#! /usr/bin/env python
# File: udf/ext_has_spouse_features.py

import sys, json
import ddlib


def my_dep_format_parser(s):
    """Parse one tab-separated dependency edge.

    Args:
        s: string of the form ``"parent<TAB>label<TAB>child"``.

    Returns:
        ``(parent, label, child)`` with both indices converted to ``int``
        and decremented by one.

    Raises:
        ValueError: if ``s`` does not split into exactly three fields or an
            index is not an integer literal.
    """
    head, relation, dependent = s.split('\t')
    head_index = int(head) - 1
    dependent_index = int(dependent) - 1
    return (head_index, relation, dependent_index)

# Each stdin line is one JSON-encoded sentence. For every sentence, print the
# lemma pair of each edge on the dependency path between the first and the
# last word.
for line in sys.stdin:
    sentence = json.loads(line)

    # Materialise the word objects so they can be counted below.
    tokens = list(ddlib.unpack_words(
        sentence,
        character_offset_begin='character_offset_begin',
        character_offset_end='character_offset_end',
        lemma='lemma',
        pos='pos',
        words='words',
        dep_graph='dep_graph',
        dep_graph_parser=my_dep_format_parser))

    last_index = len(tokens) - 1
    for edge in ddlib.dep_path_between_words(tokens, 0, last_index):
        print("%s %s" % (edge.word1.lemma, edge.word2.lemma))

Пример #8
0
#! /usr/bin/env python
# File: udf/ext_has_spouse_features.py

import sys, json
import ddlib


def my_dep_format_parser(s):
    """Turn a ``"parent<TAB>label<TAB>child"`` edge string into a tuple.

    Args:
        s: tab-separated edge description with exactly three fields.

    Returns:
        ``(parent, label, child)`` where both indices are ``int`` values
        decremented by one.

    Raises:
        ValueError: if ``s`` does not contain exactly three fields or an
            index is not an integer literal.
    """
    head, relation, dependent = s.split('\t')
    head_index = int(head) - 1
    dependent_index = int(dependent) - 1
    return (head_index, relation, dependent_index)


# Each stdin line is one JSON-encoded sentence. For every sentence, print the
# lemma pair of each edge on the dependency path between the first and the
# last word.
for line in sys.stdin:
    sentence = json.loads(line)

    # Materialise the word objects so they can be counted below.
    tokens = list(ddlib.unpack_words(
        sentence,
        character_offset_begin='character_offset_begin',
        character_offset_end='character_offset_end',
        lemma='lemma',
        pos='pos',
        words='words',
        dep_graph='dep_graph',
        dep_graph_parser=my_dep_format_parser))

    last_index = len(tokens) - 1
    for edge in ddlib.dep_path_between_words(tokens, 0, last_index):
        print("%s %s" % (edge.word1.lemma, edge.word2.lemma))