Example #1
def get_features_for_candidate(row):
    """Extract features for candidate mention- both generic ones from ddlib & custom features"""
    features = []
    f = Feature(doc_id=row.doc_id,
                section_id=row.section_id,
                relation_id=row.relation_id,
                name=None)
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib
    gene_span = ddlib.Span(begin_word_id=row.gene_wordidxs[0],
                           length=len(row.gene_wordidxs))
    pheno_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                            length=len(row.pheno_wordidxs))
    for feat in ddlib.get_generic_features_relation(dds, gene_span,
                                                    pheno_span):
        if take_feature(feat):
            features.append(f._replace(name=feat))
    features.extend(
        [f._replace(name=feat) for feat in get_custom_features(row, dds)])
    # these seem to be hurting (?)
    # start_span = ddlib.Span(begin_word_id=0, length=4)
    # for feat in ddlib.get_generic_features_mention(dds, start_span, length_bin_size=2):
    #  features.append(f._replace(name='START_SENT_%s' % feat))
    # WITH these custom features, I get a little LESS precision and a little MORE recall (!)
    # features += [f._replace(name=feat) for feat in create_ners_between(row.gene_wordidxs, row.pheno_wordidxs, row.ners)]
    return features
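Several of these examples call util.create_ddlib_sentence(row), a project helper that is not shown here. A minimal sketch of what such a helper could look like, assuming row carries the usual parallel arrays (words, lemmas, poses, ners, dep_paths, dep_parents) that the later examples unpack by hand; this is an illustration, not the project's actual implementation:

def create_ddlib_sentence_sketch(row):
    sentence = []
    for i, word in enumerate(row.words):
        sentence.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=word,
                lemma=row.lemmas[i],
                pos=row.poses[i],
                ner=row.ners[i],
                # depending on how dep_parents is stored, a -1 shift may be
                # needed (see the CoreNLP ROOT note in later examples)
                dep_par=row.dep_parents[i],
                dep_label=row.dep_paths[i]))
    return sentence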
Example #2
 def test_tokens_between_spans(self):
     span1 = dd.Span(0, 2)
     span2 = dd.Span(3, 5)
     words_between = dd.tokens_between_spans(self.words, span1, span2)
     self.assertEqual(words_between[:], (False, ["Jake"]))
     words_between = dd.tokens_between_spans(self.words, span2, span1)
     self.assertEqual(words_between[:], (True, ["Jake"]))
     words_between = dd.tokens_between_spans(self.words, span1, span1)
     self.assertEqual(words_between[:], (False, []))
Example #3
 def test_tokens_between_spans(self):
     span1 = dd.Span(0, 2)
     span2 = dd.Span(3, 5)
     words_between = dd.tokens_between_spans(self.words, span1, span2)
     self.assertEqual(
         [words_between[0], list(words_between[1])], [False, ["Jake"]])
     words_between = dd.tokens_between_spans(self.words, span2, span1)
     self.assertEqual(
         [words_between[0], list(words_between[1])], [True, ["Jake"]])
     words_between = dd.tokens_between_spans(self.words, span1, span1)
     self.assertEqual(
         [words_between[0], list(words_between[1])], [False, []])
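The tests above pin down the behavior of dd.tokens_between_spans: it returns a pair whose first element flags whether the spans were given in reverse order, and whose second element holds the tokens strictly between the two spans. A sketch of that behavior, assuming Span fields are begin_word_id and length (an illustration, not ddlib's actual implementation):

def tokens_between_spans_sketch(words, a, b):
    # True when the first span starts after the second
    reversed_order = a.begin_word_id > b.begin_word_id
    if reversed_order:
        a, b = b, a
    # tokens strictly between the end of the earlier span and the start of the later one
    return reversed_order, words[a.begin_word_id + a.length:b.begin_word_id]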
Example #4
def extract(
    p_id="text",
    e_id="text",
    p_begin_index="int",
    p_end_index="int",
    e_begin_index="int",
    e_end_index="int",
    doc_id="text",
    sent_index="int",
    tokens="text[]",
    lemmas="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the employment relation.
    """
    ddlib.load_dictionary(os.path.abspath("../../../job_employ_keyword.txt"),
                          dict_id="has_employment")
    ddlib.load_dictionary(
        os.path.abspath("../../../job_no_employ_keyword.txt"),
        dict_id="no_employment")
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_parents[i] -
                1,  # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the two mentions
    p_span = ddlib.Span(begin_word_id=p_begin_index,
                        length=(p_end_index - p_begin_index + 1))
    e_span = ddlib.Span(begin_word_id=e_begin_index,
                        length=(e_end_index - e_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, p_span, e_span):
        yield [p_id, e_id, feature]
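Since extract is a generator, a driver can iterate it directly. A hypothetical invocation with hand-built inputs (in DeepDive these columns arrive from the database; the sentence, IDs, and indices below are made up, and the keyword files referenced by load_dictionary must exist for this to run):

for row in extract(
        p_id="p1", e_id="e1",
        p_begin_index=0, p_end_index=1,
        e_begin_index=3, e_end_index=3,
        doc_id="doc1", sent_index=0,
        tokens=["Barack", "Obama", "joined", "Harvard", "."],
        lemmas=["Barack", "Obama", "join", "Harvard", "."],
        pos_tags=["NNP", "NNP", "VBD", "NNP", "."],
        ner_tags=["PERSON", "PERSON", "O", "ORGANIZATION", "O"],
        dep_types=["compound", "nsubj", "root", "dobj", "punct"],
        dep_parents=[2, 3, 0, 3, 3]):
    print(row)  # [p_id, e_id, feature]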
Example #5
def get_features_for_row(row):
    OPTS = config.PHENO_ACRONYMS['F']
    features = []
    f = Feature(doc_id=row.doc_id,
                section_id=row.section_id,
                mention_id=row.mention_id,
                name=None)

    # (1) Get generic ddlib features
    sentence = util.create_ddlib_sentence(row)
    allWordIdxs = row.short_wordidxs + row.long_wordidxs
    start = min(allWordIdxs)
    length = max(allWordIdxs) - start + 1  # cover through the last mention word
    span = ddlib.Span(begin_word_id=start, length=length)
    assert span.length > 0, row  # len(span) on a Span namedtuple is always 2
    assert start + length <= len(row.words), (start + length, len(row.words),
                                              row)
    generic_features = [
        f._replace(name=feat)
        for feat in ddlib.get_generic_features_mention(sentence, span)
    ]

    # Optionally filter out some generic features
    if OPTS.get('exclude_generic'):
        generic_features = filter(
            lambda feat: not feat.startswith(tuple(OPTS['exclude_generic'])),
            generic_features)

    features += generic_features

    return features
Example #6
def get_features_for_row(row):
    #OPTS = config.GENE['F']
    features = []
    f = Feature(doc_id=row.doc_id,
                section_id=row.section_id,
                mention_id=row.mention_id,
                name=None)

    # (1) Get generic ddlib features
    sentence = util.create_ddlib_sentence(row)
    span = ddlib.Span(begin_word_id=row.mention_wordidxs[0],
                      length=len(row.mention_wordidxs))
    generic_features = [
        f._replace(name=feat)
        for feat in ddlib.get_generic_features_mention(sentence, span)
    ]

    features += generic_features
    features += [f._replace(name=feat) for feat in get_custom_features(row)]

    # (2) Include gene type as a feature
    # Note: including this as feature creates massive overfitting, for obvious reasons
    # We need neg supervision of canonical & noncanonical symbols, then can / should try adding this feature
    """
  for t in ENSEMBL_TYPES:
    if re.search(re.escape(t), row.mention_type, flags=re.I):
      features.append(f._replace(name='GENE_TYPE[%s]' % t))
      break
  """
    return features
Example #7
def extract(
    organization_id="text",
    begin_index="int",
    end_index="int",
    doc_id="text",
    sentence_index="int",
    tokens="text[]",
    pos_tags="text[]",
    dep_types="text[]",
    dep_heads="int[]",
):
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=tokens[i],
                pos=pos_tags[i],
                ner=None,
                dep_par=dep_heads[i] -
                1,  # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
                dep_label=dep_types[i]))
    ####
    org_span = ddlib.Span(begin_word_id=begin_index,
                          length=(end_index - begin_index + 1))
    for feature in ddlib.get_generic_features_mention(sent, org_span):
        yield [organization_id, feature]
Example #8
def extract(
    mention_id="text",
    doc_begin_index="int",
    doc_end_index="int",
    doc_id="text",
    position="text",
    sentence_index="int",
    tokens="text[]",
    pos_tags="text[]",
):
    # Constant
    # WINDOW_SIZE = 10

    # Load keyword dictionaries using ddlib, for domain-specific features
    # Words in the "crime" dictionary are indicative of crime
    # Words in the "non_crime" dictionary are indicative of non-crime
    APP_HOME = os.environ['APP_HOME']
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_crime.txt",
                          dict_id="crime")
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_non_crime.txt",
                          dict_id="non_crime")

    # kw_non_legal_penalty = map(lambda word: word.strip(), open(APP_HOME + "/udf/dicts/kw_non_legal_penalty.txt", 'r').readlines())
    # kw_legal_penalty = map(lambda word: word.strip(), open(APP_HOME + "/udf/dicts/kw_legal_penalty.txt", 'r').readlines())
    # Non penalty signals on the left of candidate mention
    # NON_PENAL_SIGNALS_LEFT = frozenset(kw_non_legal_penalty)
    # Penalty signals on the right of candidate mention
    # PENAL_SIGNALS_LEFT = frozenset(kw_legal_penalty)

    WINDOW_SIZE = 10
    MAX_PHRASE_LENGTH = 5

    # Get all subsequences of left sentence with WINDOW_SIZE = 10
    low_tokens = map(lambda token: token.lower(), tokens)
    left_window = get_left_window(doc_begin_index, low_tokens, WINDOW_SIZE)
    phrases_in_sentence_left = list(
        get_all_phrases_in_sentence(left_window, MAX_PHRASE_LENGTH))

    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=t.lower(),  # lemma for Vietnamese: lowercase
                pos=pos_tags[i],
                ner=None,
                dep_par=-1,  # no dependency parse available; -1 is ROOT for DDLIB
                dep_label=None))

    # Create DDLIB Span for penalty candidate
    penalty_span = ddlib.Span(begin_word_id=doc_begin_index,
                              length=(doc_end_index - doc_begin_index + 1))

    # Generate the generic features using DDLIB on the mention span
    for feature in ddlib.get_generic_features_mention(sent, penalty_span):
        yield [mention_id, feature]
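This UDF (and Example #18 below) relies on two project helpers that are not shown: get_left_window and get_all_phrases_in_sentence. Hypothetical sketches consistent with how they are used here (a token window to the left of the mention, and space-joined n-grams so that multi-word keywords like "phạt tù" can be matched):

def get_left_window(begin_index, tokens, window_size):
    # up to window_size tokens immediately to the left of the mention
    return tokens[max(0, begin_index - window_size):begin_index]

def get_all_phrases_in_sentence(tokens, max_phrase_length):
    # every contiguous n-gram of length 1..max_phrase_length, space-joined
    for start in range(len(tokens)):
        upper = min(start + max_phrase_length, len(tokens))
        for end in range(start + 1, upper + 1):
            yield " ".join(tokens[start:end])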
Example #9
def get_features_for_candidate(row):
    """Extract features for candidate mention- both generic ones from ddlib & custom features"""
    features = []
    f = Feature(doc_id=row.doc_id,
                section_id=row.section_id,
                relation_id=row.relation_id,
                name=None)
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib
    genevar_span = ddlib.Span(begin_word_id=row.genevar_wordidxs[0],
                              length=len(row.genevar_wordidxs))
    pheno_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                            length=len(row.pheno_wordidxs))
    features += [f._replace(name=feat) \
                      for feat in ddlib.get_generic_features_relation(dds, genevar_span, pheno_span)]
    return features
Example #10
def extract(
        chemical_id             = "text",
        disease_id              = "text",
        chemical_begin_index    = "int",
        chemical_end_index      = "int",
        disease_begin_index     = "int",
        disease_end_index       = "int",
        doc_id                  = "text",
        sent_index              = "int",
        tokens                  = "text[]",
        lemmas                  = "text[]",
        pos_tags                = "text[]",
        ner_tags                = "text[]",
        my_ner_tags             = "text[]",
        my_ner_tags_token_ids   = "int[]",
        dep_types               = "text[]",
        dep_parents             = "int[]",
    ):
    """
    Uses DDLIB to generate features for the chemical-disease relation candidates.
    """

    # creates a dictionary of tags from the sparse my_ner_tags array
    my_ner_tags_dict = { i:tag for i,tag in zip(my_ner_tags_token_ids, my_ner_tags) }

    sent = []
    for i,t in enumerate(tokens):
        sent.append(ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=t,
            lemma=lemmas[i],
            pos=pos_tags[i],
            # replace NER tag if one is found for that token in my_ner_tags:
            ner=my_ner_tags_dict[i] if i in my_ner_tags_dict else ner_tags[i],
            dep_par=dep_parents[i] - 1,  # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
            dep_label=dep_types[i]))

    # Create DDLIB Spans for the chemical and disease mentions
    chemical_span = ddlib.Span(begin_word_id=chemical_begin_index, length=(chemical_end_index-chemical_begin_index+1))
    disease_span = ddlib.Span(begin_word_id=disease_begin_index, length=(disease_end_index-disease_begin_index+1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, chemical_span, disease_span):
        yield [chemical_id, disease_id, feature]
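A quick check of the sparse-override pattern above: only tokens listed in my_ner_tags_token_ids get their NER tag replaced, and every other token keeps its CoreNLP tag (the values here are made up):

my_ner_tags_token_ids = [2, 5]
my_ner_tags = ["CHEMICAL", "DISEASE"]
ner_tags = ["O", "O", "O", "O", "O", "O"]
my_ner_tags_dict = {i: tag for i, tag in zip(my_ner_tags_token_ids, my_ner_tags)}
merged = [my_ner_tags_dict[i] if i in my_ner_tags_dict else t
          for i, t in enumerate(ner_tags)]
assert merged == ["O", "O", "CHEMICAL", "O", "O", "DISEASE"]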
Example #11
def extract(
    gene_id="text",
    variation_id="text",
    gene_begin_index="int",
    gene_end_index="int",
    var_begin_index="int",
    var_end_index="int",
    doc_id="text",
    sent_index="int",
    tokens="text[]",
    lemmas="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the gene-variation relation.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_parents[i] -
                1,  # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the gene and variation mentions
    # (no +1 here, unlike the other examples, so this assumes exclusive end indices)
    gene_span = ddlib.Span(begin_word_id=gene_begin_index,
                           length=gene_end_index - gene_begin_index)
    variation_span = ddlib.Span(begin_word_id=var_begin_index,
                                length=var_end_index - var_begin_index)

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, gene_span,
                                                       variation_span):
        yield [gene_id, variation_id, feature]
Example #12
def extract(
    p1_id="text",
    p2_id="text",
    p1_begin_index="int",
    p1_end_index="int",
    p2_begin_index="int",
    p2_end_index="int",
    doc_id="text",
    sent_index="int",
    tokens="text[]",
    lemmas="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the MED-ARD relation.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_parents[i] -
                1,  # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the two mentions
    p1_span = ddlib.Span(begin_word_id=p1_begin_index,
                         length=(p1_end_index - p1_begin_index + 1))
    p2_span = ddlib.Span(begin_word_id=p2_begin_index,
                         length=(p2_end_index - p2_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, p1_span, p2_span):
        yield [p1_id, p2_id, feature]
Example #13
def extract(S_id="text",
            O_id="text",
            S_begin_index="int",
            S_end_index="int",
            O_begin_index="int",
            O_end_index="int",
            sent_id="text",
            tokens="text[]",
            pos_tags="text[]",
            ner_tags="text[]",
            dep_types="text[]",
            dep_tokens="int[]"):
    """
    Uses DDLIB to generate features for relation.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    if len(tokens) != len(pos_tags):
        print >> sys.stderr, '===>>>', sent_id, len(tokens), len(pos_tags)
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=tokens[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_tokens[i] -
                1,  # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the two mentions
    S_span = ddlib.Span(begin_word_id=S_begin_index,
                        length=(S_end_index - S_begin_index + 1))
    O_span = ddlib.Span(begin_word_id=O_begin_index,
                        length=(O_end_index - O_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, S_span, O_span):
        yield [S_id, O_id, feature]
Example #14
def get_features_for_candidate(row):
    """Extract features for candidate mention- both generic ones from ddlib & custom features"""
    features = []
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib
    span = ddlib.Span(begin_word_id=row.mention_wordidxs[0],
                      length=len(row.mention_wordidxs))
    features += [(row.doc_id, row.section_id, row.mention_id, feat) \
                      for feat in ddlib.get_generic_features_mention(dds, span)]

    # (2) Add the closest verb by raw distance
    if OPTS.get('closest-verb'):
        verb_idxs = [i for i, p in enumerate(row.poses) if p.startswith("VB")]
        if len(verb_idxs) > 0:
            dists = filter(lambda d : d[0] > 0, \
                           [(min([abs(i-j) for j in row.mention_wordidxs]), i) for i in verb_idxs])
            if len(dists) > 0:
                verb = row.lemmas[min(dists)[1]]
                features.append((row.doc_id, row.section_id, row.mention_id,
                                 'NEAREST_VERB_[%s]' % (verb, )))
    return features
Example #15
def run(doc_id, sent_id, words, lemmas, poses, ners, dep_paths, dep_parents,
        mention_id, wordidxs):
    try:
        import ddlib
    except ImportError:
        import os
        DD_HOME = os.environ['DEEPDIVE_HOME']
        from sys import path
        path.append('%s/ddlib' % DD_HOME)
        import ddlib

    def unpack_(begin_char_offsets, end_char_offsets, words, lemmas, poses,
                ners, dep_parents, dep_paths):
        wordobjs = []
        for i in range(0, len(words)):
            wordobjs.append(
                ddlib.Word(
                    begin_char_offset=None,
                    end_char_offset=None,
                    word=words[i],
                    lemma=lemmas[i],
                    pos=poses[i],
                    ner='',  # NER is noisy on medical docs
                    dep_par=dep_parents[i],
                    dep_label=dep_paths[i]))
        return wordobjs

    begin_char_offsets = None
    end_char_offsets = None

    sentence = unpack_(begin_char_offsets, end_char_offsets, words, lemmas,
                       poses, ners, dep_parents, dep_paths)
    span = ddlib.Span(begin_word_id=wordidxs[0], length=len(wordidxs))

    for feature in ddlib.get_generic_features_mention(sentence, span):
        yield doc_id, mention_id, feature
Example #16
# File: udf/ext_has_spouse_features.py

# Sample input data (piped into STDIN):
'''
{"p2_length":2,"p1_length":2,"lemma":["Sen.","Barack","Obama","and","he","wife",",","Michelle","Obama",",","have","release","eight","year","of","joint","return","."],"words":["Sen.","Barack","Obama","and","his","wife",",","Michelle","Obama",",","have","released","eight","years","of","joint","returns","."],"relation_id":"118238@10_7_118238@10_1","p1_start_position":7,"p2_start_position":1}
'''

import sys, json
import ddlib     # DeepDive python utility

# For each input tuple
for row in sys.stdin:
  obj = json.loads(row)
  words = obj["words"]
  # Unpack input into tuples.
  span1 = ddlib.Span(begin_word_id=obj['p1_start_position'], length=obj['p1_length'])
  span2 = ddlib.Span(begin_word_id=obj['p2_start_position'], length=obj['p2_length'])

  # Features for this pair come in here
  features = set()

  # Feature 1: Bag of words between the two phrases
  words_between = ddlib.tokens_between_spans(words, span1, span2)
  for word in words_between.elements:
    features.add("word_between=" + word)

  # Feature 2: Number of words between the two phrases
  features.add("num_words_between=%s" % len(words_between.elements))

  # Feature 3: Does the last word (last name) match?
  last_word_left = ddlib.materialize_span(words, span1)[-1]
Example #17
import sys
import ddlib

ARR_DELIM = '~^~'

# For each input tuple
for row in sys.stdin:
  parts = row.strip().split('\t')
  if len(parts) != 6:
    print >>sys.stderr, 'Failed to parse row:', row
    continue

  # Get all fields from a row
  words = parts[0].split(ARR_DELIM)
  relation_id = parts[1]
  p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[2:]]

  # Unpack input into tuples.
  span1 = ddlib.Span(begin_word_id=p1_start, length=p1_length)
  span2 = ddlib.Span(begin_word_id=p2_start, length=p2_length)

  # Features for this pair come in here
  features = set()

  # Feature 1: Bag of words between the two phrases
  words_between = ddlib.tokens_between_spans(words, span1, span2)
  for word in words_between.elements:
    features.add("word_between=" + word)

  # Feature 2: Number of words between the two phrases
  features.add("num_words_between=%s" % len(words_between.elements))

  # Feature 3: Does the last word (last name) match?
  last_word_left = ddlib.materialize_span(words, span1)[-1]
Example #18
def extract(
    p_id="text",
    p_begin_index="int",
    p_end_index="int",
    doc_id="text",
    sent_index="int",
    tokens="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the legal penalty mention
    """
    # Constant
    # WINDOW_SIZE = 10

    # Load keyword dictionaries using ddlib, for domain-specific features
    # Words in the "legal_penalty" dictionary are indicative of a legal penalty
    # Words in the "non_legal_penalty" dictionary are indicative of no legal penalty
    APP_HOME = os.environ['APP_HOME']
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_legal_penalty.txt",
                          dict_id="legal_penalty")
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_non_legal_penalty.txt",
                          dict_id="non_legal_penalty")

    kw_non_legal_penalty = map(
        lambda word: word.strip(),
        open(APP_HOME + "/udf/dicts/kw_non_legal_penalty.txt",
             'r').readlines())
    # kw_legal_penalty = map(lambda word: word.strip(), open(APP_HOME + "/udf/dicts/kw_legal_penalty.txt", 'r').readlines())
    # Non penalty signals on the left of candidate mention
    NON_PENAL_SIGNALS_LEFT = frozenset(kw_non_legal_penalty)
    # Penalty signals on the right of candidate mention
    # PENAL_SIGNALS_LEFT = frozenset(kw_legal_penalty)

    WINDOW_SIZE = 10
    MAX_PHRASE_LENGTH = 5

    # Get all subsequences of left sentence with WINDOW_SIZE = 10
    low_tokens = map(lambda token: token.lower(), tokens)
    left_window = get_left_window(p_begin_index, low_tokens, WINDOW_SIZE)
    phrases_in_sentence_left = list(
        get_all_phrases_in_sentence(left_window, MAX_PHRASE_LENGTH))

    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=t.lower(),  # lemma for vietnamese: lowercase
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_parents[i] -
                1,  # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
                dep_label=dep_types[i]))

    # Create DDLIB Span for penalty candidate
    penalty_span = ddlib.Span(begin_word_id=p_begin_index,
                              length=(p_end_index - p_begin_index + 1))

    # Generate the generic features using DDLIB on the mention span
    for feature in ddlib.get_generic_features_mention(sent, penalty_span):
        yield [p_id, feature]

    # A keyword indicating non-legal-penalty appears in the left window
    if len(NON_PENAL_SIGNALS_LEFT.intersection(phrases_in_sentence_left)) > 0:
        yield [p_id, 'APPEAR_LEFT_KW_NON_LEGAL_PENALTY']

    # "phạt tù" appear on the left of mention
    if "phạt tù" in phrases_in_sentence_left:
        yield [p_id, 'APPEAR_LEFT_PHAT_TU']
Example #19
#! /usr/bin/env python
# File: udf/ext_has_spouse_features.py

import sys, json
import ddlib

# For each input tuple
# TODO: Sample Data and the input schema. 
# sample json
for row in sys.stdin:

  # Unpack input into tuples.
  #
  obj = json.loads(row)
  words, lemmas = obj["words"], obj["lemma"]
  span1 = ddlib.Span(begin_word_id=obj['p1.start_position'], length=obj['p1.length'])
  span2 = ddlib.Span(begin_word_id=obj['p2.start_position'], length=obj['p2.length'])

  features = set()

  # Feature 1: Find out if a lemma of marry occurs.
  # A better feature would ensure this is on the dependency path between the two.
  #
  lemma_between = ddlib.tokens_between_spans(lemmas, span1, span2)
  married_words = ('marry', 'widow')
  for lemma in lemma_between.elements:
    if lemma in married_words:
      features.add("important_word=%s" % lemma) 

  # Feature 2: The number of words between the two phrases.
  # Intuition: if they are close by, the link may be stronger.
Example #20
 def test_materialize_span(self):
     span1 = dd.Span(0, 3)
     materialized_span = dd.materialize_span(self.words, span1)
     self.assertEqual(materialized_span[:], ["Tanja", "married", "Jake"])
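The test above implies plain slice semantics for dd.materialize_span: a Span(begin_word_id, length) selects words[begin_word_id:begin_word_id + length]. A sketch of that behavior (an illustration, not ddlib's actual implementation):

def materialize_span_sketch(words, span):
    return words[span.begin_word_id:span.begin_word_id + span.length]

materialize_span_sketch(["Tanja", "married", "Jake"], dd.Span(0, 3))
# -> ["Tanja", "married", "Jake"]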