def extract(
    mention_id="text",
    doc_begin_index="int",
    doc_end_index="int",
    doc_id="text",
    position="text",
    sentence_index="int",
    tokens="text[]",
    pos_tags="text[]",
):
    # Load keyword dictionaries using ddlib, for domain-specific features
    # Words in the "crime" dictionary are indicative of a crime
    # Words in the "non_crime" dictionary are indicative of a non-crime
    APP_HOME = os.environ['APP_HOME']
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_crime.txt",
                          dict_id="crime")
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_non_crime.txt",
                          dict_id="non_crime")

    WINDOW_SIZE = 10
    MAX_PHRASE_LENGTH = 5

    # Collect all phrases of up to MAX_PHRASE_LENGTH tokens from the
    # WINDOW_SIZE tokens to the left of the mention (currently unused below)
    low_tokens = [token.lower() for token in tokens]
    left_window = get_left_window(doc_begin_index, low_tokens, WINDOW_SIZE)
    phrases_in_sentence_left = list(
        get_all_phrases_in_sentence(left_window, MAX_PHRASE_LENGTH))

    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=t.lower(),  # lemma for Vietnamese: lowercased token
                pos=pos_tags[i],
                ner=None,
                dep_par=-1,  # no dependency parse in this schema; DDLIB uses -1 for ROOT
                dep_label=None))

    # Create DDLIB Span for penalty candidate
    penalty_span = ddlib.Span(begin_word_id=doc_begin_index,
                              length=(doc_end_index - doc_begin_index + 1))

    # Generate the generic features using DDLIB on left and right window
    for feature in ddlib.get_generic_features_mention(sent, penalty_span):
        yield [mention_id, feature]
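
# get_left_window and get_all_phrases_in_sentence are referenced above but are
# not part of this excerpt. A minimal sketch consistent with how they are
# called (hypothetical reconstructions, not the original helpers):
def get_left_window(begin_index, tokens, window_size):
    # Up to window_size tokens immediately to the left of the mention
    return tokens[max(0, begin_index - window_size):begin_index]


def get_all_phrases_in_sentence(tokens, max_phrase_length):
    # Yield every space-joined n-gram of up to max_phrase_length tokens, so a
    # multi-word dictionary entry such as "phạt tù" (see the legal-penalty
    # example below) can be matched by a simple set lookup
    for start in range(len(tokens)):
        for end in range(start + 1,
                         min(start + max_phrase_length, len(tokens)) + 1):
            yield " ".join(tokens[start:end])
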
def add_features_generic(mention_id, pheno_words, sentence):
    # Use the generic feature library (ONLY!)

    # Load dictionaries for keywords
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/pheno_var.tsv", "VARKW")
    ddlib.load_dictionary(
        BASE_DIR + "/dicts/features/pheno_patient.tsv", "PATIENTKW")

    # Build the column-oriented dict that ddlib.unpack_words expects
    obj = {
        'lemma': [],
        'words': [],
        'ner': [],
        'pos': [],
        'dep_graph': [],
    }
    for word in sentence.words:
        obj['lemma'].append(word.lemma)
        obj['words'].append(word.word)
        obj['ner'].append(word.ner)
        obj['pos'].append(word.pos)
        obj['dep_graph'].append(
            str(word.dep_parent + 1) + "\t" + word.dep_path + "\t" +
            str(word.in_sent_idx + 1))
    word_obj_list = ddlib.unpack_words(
        obj, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
    pheno_span = ddlib.get_span(pheno_words[0].in_sent_idx, len(pheno_words))
    features = set(
        ddlib.get_generic_features_mention(word_obj_list, pheno_span))
    for feature in features:
        print_feature(sentence.doc_id, mention_id, feature)
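
# For reference, a hypothetical two-word sentence showing the column-oriented
# dict consumed above; each dep_graph entry is a "parent<TAB>label<TAB>child"
# triplet with the 1-based indices built in the loop above. All values are
# made up for illustration:
example_obj = {
    'words': ['EGFR', 'mutations'],
    'lemma': ['egfr', 'mutation'],
    'pos': ['NN', 'NNS'],
    'ner': ['GENE', 'O'],
    'dep_graph': ['2\tnn\t1', '0\troot\t2'],
}
example_words = ddlib.unpack_words(
    example_obj, lemma='lemma', pos='pos', ner='ner', words='words',
    dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
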
def extract(
    p_id="text",
    e_id="text",
    p_begin_index="int",
    p_end_index="int",
    e_begin_index="int",
    e_end_index="int",
    doc_id="text",
    sent_index="int",
    tokens="text[]",
    lemmas="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the spouse relation.
    """
    ddlib.load_dictionary(os.path.abspath("../../../job_employ_keyword.txt"),
                          dict_id="has_employment")
    ddlib.load_dictionary(
        os.path.abspath("../../../job_no_employ_keyword.txt"),
        dict_id="no_employment")
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_parents[i] - 1,  # CoreNLP stores ROOT as 0; DDLIB expects -1
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the two mentions
    p_span = ddlib.Span(begin_word_id=p_begin_index,
                        length=(p_end_index - p_begin_index + 1))
    e_span = ddlib.Span(begin_word_id=e_begin_index,
                        length=(e_end_index - e_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, p_span, e_span):
        yield [p_id, e_id, feature]
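
# In a DeepDive app, extract functions like those above are normally wired up
# with the deepdive helper module's decorators, which handle the TSV I/O and
# the column types given as default arguments. A minimal sketch, assuming the
# conventions of the standard DeepDive tutorial apps:
#
#   from deepdive import tsv_extractor, returns
#
#   @tsv_extractor
#   @returns(lambda p_id="text", e_id="text", feature="text": [])
#   def extract(...):
#       ...
#       yield [p_id, e_id, feature]
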
Example #4
def get_features_for_candidate(row):
    # (header reconstructed: the excerpt begins mid-function; the name and the
    # single `row` argument come from the row_fn registration at the bottom)
    features = []
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib
    span = ddlib.Span(begin_word_id=row.mention_wordidxs[0],
                      length=len(row.mention_wordidxs))
    features += [(row.doc_id, row.section_id, row.mention_id, feat) \
                      for feat in ddlib.get_generic_features_mention(dds, span)]

    # (2) Add the closest verb by raw distance
    if OPTS.get('closest-verb'):
        verb_idxs = [i for i, p in enumerate(row.poses) if p.startswith("VB")]
        if len(verb_idxs) > 0:
            dists = [(min(abs(i - j) for j in row.mention_wordidxs), i)
                     for i in verb_idxs]
            dists = [d for d in dists if d[0] > 0]
            if len(dists) > 0:
                verb = row.lemmas[min(dists)[1]]
                features.append((row.doc_id, row.section_id, row.mention_id,
                                 'NEAREST_VERB_[%s]' % (verb, )))
    return features


# Load in manually defined keywords
onto_path = lambda p: '%s/onto/%s' % (os.environ['GDD_HOME'], p)

if __name__ == '__main__':
    if OPTS.get('sentence-kws'):
        ddlib.load_dictionary(onto_path('manual/pheno_sentence_keywords.tsv'),
                              dict_id='pheno_kws')
    util.run_main_tsv(row_parser=parser.parse_tsv_row,
                      row_fn=get_features_for_candidate)
Example #5
#! /usr/bin/env python

import sys, os
import ddlib     # DeepDive python utility

ARR_DELIM = '~^~'

# Load keyword dictionaries using ddlib, for domain-specific features
# Words in "married" dictionary are indicative of marriage
# Words in "non_married" dictionary are indicative of non_marriage
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
ddlib.load_dictionary(BASE_DIR + "/dicts/married.txt", dict_id="married")
ddlib.load_dictionary(BASE_DIR + "/dicts/non_married.txt", dict_id="non_married")

# For each input tuple
for row in sys.stdin:
  parts = row.strip().split('\t')

  # Get all fields from a row
  words = parts[0].split(ARR_DELIM)
  lemmas = parts[1].split(ARR_DELIM)
  poses = parts[2].split(ARR_DELIM)
  dependencies = parts[3].split(ARR_DELIM)
  ners = parts[4].split(ARR_DELIM)
  relation_id = parts[5]
  p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[6:]]

  # Get a sentence from ddlib -- array of "Word" objects
  if len(dependencies) == 0:
    print >>sys.stderr, str(relation_id) + '\t' + 'DEP_PATH_EMPTY'
    continue
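
# The excerpt ends here. The remainder typically mirrors the other examples in
# this section: build ddlib Word objects from the parsed columns, wrap the two
# mentions in Spans, and emit the generic relation features. A hedged sketch
# of that continuation (Python 2, like the script above; the dependency column
# is left unparsed here rather than guessing its format):
#
#   sent = []
#   for i in range(len(words)):
#       sent.append(ddlib.Word(
#           begin_char_offset=None, end_char_offset=None,
#           word=words[i], lemma=lemmas[i], pos=poses[i], ner=ners[i],
#           dep_par=-1,            # real code would parse dependencies[i]
#           dep_label=None))
#   p1_span = ddlib.Span(begin_word_id=p1_start, length=p1_length)
#   p2_span = ddlib.Span(begin_word_id=p2_start, length=p2_length)
#   for feature in ddlib.get_generic_features_relation(sent, p1_span, p2_span):
#       print str(relation_id) + '\t' + feature
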
Example #6
def extract(
    p_id="text",
    p_begin_index="int",
    p_end_index="int",
    doc_id="text",
    sent_index="int",
    tokens="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the legal penalty mention
    """
    # Load keyword dictionaries using ddlib, for domain-specific features
    # Words in the "legal_penalty" dictionary are indicative of a legal penalty
    # Words in the "non_legal_penalty" dictionary are indicative of a non-penalty
    APP_HOME = os.environ['APP_HOME']
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_legal_penalty.txt",
                          dict_id="legal_penalty")
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_non_legal_penalty.txt",
                          dict_id="non_legal_penalty")

    # Non-penalty signals to look for on the left of the candidate mention
    with open(APP_HOME + "/udf/dicts/kw_non_legal_penalty.txt") as f:
        NON_PENAL_SIGNALS_LEFT = frozenset(line.strip() for line in f)

    WINDOW_SIZE = 10
    MAX_PHRASE_LENGTH = 5

    # Collect all phrases of up to MAX_PHRASE_LENGTH tokens from the
    # WINDOW_SIZE tokens to the left of the mention
    low_tokens = [token.lower() for token in tokens]
    left_window = get_left_window(p_begin_index, low_tokens, WINDOW_SIZE)
    phrases_in_sentence_left = list(
        get_all_phrases_in_sentence(left_window, MAX_PHRASE_LENGTH))

    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=t.lower(),  # lemma for Vietnamese: lowercased token
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_parents[i] - 1,  # CoreNLP stores ROOT as 0; DDLIB expects -1
                dep_label=dep_types[i]))

    # Create DDLIB Span for penalty candidate
    penalty_span = ddlib.Span(begin_word_id=p_begin_index,
                              length=(p_end_index - p_begin_index + 1))

    # Generate the generic features using DDLIB on left and right window
    for feature in ddlib.get_generic_features_mention(sent, penalty_span):
        yield [p_id, feature]

    # A non-legal-penalty keyword appears to the left of the mention
    if NON_PENAL_SIGNALS_LEFT.intersection(phrases_in_sentence_left):
        yield [p_id, 'APPEAR_LEFT_KW_NON_LEGAL_PENALTY']

    # "phạt tù" ("imprisonment") appears to the left of the mention
    if "phạt tù" in phrases_in_sentence_left:
        yield [p_id, 'APPEAR_LEFT_PHAT_TU']
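
# A hypothetical smoke test for the UDF above (all data made up; assumes ddlib
# is importable, APP_HOME points at an app containing the dictionary files,
# and the window/phrase helpers sketched after the first example are defined):
if __name__ == '__main__':
    toks = ['bị', 'cáo', 'bị', 'phạt', 'tù', 'ba', 'năm']
    n = len(toks)
    for row in extract(p_id='p1', p_begin_index=3, p_end_index=4,
                       doc_id='d1', sent_index=0, tokens=toks,
                       pos_tags=['X'] * n, ner_tags=['O'] * n,
                       dep_types=['dep'] * n, dep_parents=[0] * n):
        print(row)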
Example #7
def add_features_generic(mention_id, gene_words, sentence):
    # Use the generic feature library (ONLY!)

    # Load dictionaries for keywords
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/gene_var.tsv", "VARKW")
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/gene_knock.tsv",
                          "KNOCKKW")
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/gene_amino.tsv",
                          "AMINOKW")
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/gene_antigene.tsv",
                          "ANTIGENEKW")
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/gene_dna.tsv", "DNAKW")
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/gene_downregulation.tsv",
                          "DOWNREGKW")
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/gene_upregulation.tsv",
                          "UPREGKW")
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/gene_tumor.tsv",
                          "TUMORKW")
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/gene_gene.tsv", "GENEKW")
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/gene_expression.tsv",
                          "EXPRESSKW")
    # Build the column-oriented dict that ddlib.unpack_words expects
    obj = {
        'lemma': [],
        'words': [],
        'ner': [],
        'pos': [],
        'dep_graph': [],
    }
    for word in sentence.words:
        obj['lemma'].append(word.lemma)
        obj['words'].append(word.word)
        obj['ner'].append(word.ner)
        obj['pos'].append(word.pos)
        obj['dep_graph'].append(
            str(word.dep_parent + 1) + "\t" + word.dep_path + "\t" +
            str(word.in_sent_idx + 1))
    word_obj_list = ddlib.unpack_words(
        obj, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
    gene_span = ddlib.get_span(gene_words[0].in_sent_idx, len(gene_words))
    features = set(
        ddlib.get_generic_features_mention(word_obj_list, gene_span))
    for feature in features:
        print_feature(sentence.doc_id, mention_id, feature)
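
# ddlib.get_span(begin, length) above plays the same role as the explicit
# Span construction used by the other examples in this section, e.g.:
#
#   gene_span = ddlib.Span(begin_word_id=gene_words[0].in_sent_idx,
#                          length=len(gene_words))
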
Example #9
#! /usr/bin/env python

import sys, os
import ddlib     # DeepDive python utility

ARR_DELIM = '~^~'

# Load keyword dictionaries using ddlib, for domain-specific features
# Words in "married" dictionary are indicative of marriage
# Words in "non_married" dictionary are indicative of non_marriage
APP_HOME = os.environ['APP_HOME']
ddlib.load_dictionary(APP_HOME + "/udf/dicts/married.txt", dict_id="married")
ddlib.load_dictionary(APP_HOME + "/udf/dicts/non_married.txt", dict_id="non_married")

# For each input tuple
for row in sys.stdin:
  parts = row.strip().split('\t')

  # Get all fields from a row
  words = parts[0].split(ARR_DELIM)
  lemmas = parts[1].split(ARR_DELIM)
  poses = parts[2].split(ARR_DELIM)
  dependencies = parts[3].split(ARR_DELIM)
  ners = parts[4].split(ARR_DELIM)
  relation_id = parts[5]
  p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[6:]]

  # Get a sentence from ddlib -- array of "Word" objects
  if len(dependencies) == 0:
    print >>sys.stderr, str(relation_id) + '\t' + 'DEP_PATH_EMPTY'
    continue