import sys


# Keyword defaults declare the input column types (DeepDive-style UDF convention).
def extract(
        doc_id         = "text",
        sentence_index = "int",
        tokens         = "text[]",
        ner_tags       = "text[]",
    ):
    """
    Finds phrases made of consecutive tokens tagged as PERSON.
    """
    # Normalize the NER tags (transform_ner_tags is defined elsewhere in this module).
    ner_tags = transform_ner_tags(ner_tags)
    num_tokens = len(ner_tags)
    # Debug separator written to the extractor's stderr log.
    print >>sys.stderr, '============================='
    # find the first index of each maximal run of tokens tagged as PERSON
    first_indexes = (
        i for i in xrange(num_tokens)
        if ner_tags[i] == "PERSON" and (i == 0 or ner_tags[i - 1] != "PERSON")
    )
    for begin_index in first_indexes:
        # find the end of the PERSON phrase (consecutive tokens tagged as PERSON)
        end_index = begin_index + 1
        while end_index < num_tokens and ner_tags[end_index] == "PERSON":
            end_index += 1
        end_index -= 1  # make end_index point at the last PERSON token (inclusive)
        # generate a mention identifier
        mention_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
        mention_text = " ".join(map(lambda i: tokens[i], xrange(begin_index, end_index + 1)))
        # Output a tuple for each PERSON phrase
        yield [
            mention_id,
            mention_text,
            doc_id,
            sentence_index,
            begin_index,
            end_index,
        ]
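
Both UDFs call transform_ner_tags, and the labeling function below builds SpouseLabel records, yet neither is defined in this snippet. The following is a minimal sketch of what they might look like; it is an assumption for illustration, not the original module's code.

from collections import namedtuple

# Assumed shape of the label record; it matches how supervise() constructs it and calls _replace().
SpouseLabel = namedtuple("SpouseLabel", ["p1_id", "p2_id", "label", "type"])

def transform_ner_tags(ner_tags):
    # Hypothetical normalization: map tagger-specific labels such as "PER" to "PERSON"
    # and leave every other tag untouched. The real helper may do more.
    return ["PERSON" if tag in ("PER", "PERSON") else tag for tag in ner_tags]
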
def supervise(
    p1_id="text",
    p1_begin="int",
    p1_end="int",
    p2_id="text",
    p2_begin="int",
    p2_end="int",
    doc_id="text",
    sentence_index="int",
    sentence_text="text",
    tokens="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_token_indexes="int[]",
):

    ner_tags = transform_ner_tags(ner_tags)
    # Constants
    # Marriage words: 妻子 (wife), 丈夫 (husband), 老公/老婆 (colloquial husband/wife), 情侣 (couple)
    MARRIED = frozenset([u"妻子", u"丈夫", u"老公", u"老婆", u"情侣"])
    # Family words: 父亲/爸爸 (father), 母亲/妈妈 (mother), 姐姐 (older sister), 哥哥 (older brother),
    # 侄子 (nephew), 姐弟 (sister and brother), 姐妹 (sisters)
    FAMILY = frozenset([u"父亲", u"母亲", u"姐姐", u"哥哥", u"侄子", u"爸爸", u"妈妈", u"姐弟", u"姐妹"])
    MAX_DIST = 10

    # Common data objects: the token span between the two mentions
    # (whichever comes first in the sentence) and the tokens after the later mention.
    p1_end_idx = min(p1_end, p2_end)
    p2_start_idx = max(p1_begin, p2_begin)
    p2_end_idx = max(p1_end, p2_end)
    intermediate_tokens = tokens[p1_end_idx + 1 : p2_start_idx]
    intermediate_ner_tags = ner_tags[p1_end_idx + 1 : p2_start_idx]
    tail_tokens = tokens[p2_end_idx + 1 :]
    # Unlabeled candidate; SpouseLabel is a namedtuple defined elsewhere in the module.
    spouse = SpouseLabel(p1_id=p1_id, p2_id=p2_id, label=None, type=None)

    # Rule: Candidates that are too far apart
    if len(intermediate_tokens) > MAX_DIST:
        yield spouse._replace(label=-1, type="neg:far_apart")

    # Rule: Candidates that have a third person in between
    if "PERSON" in intermediate_ner_tags:
        yield spouse._replace(label=-1, type="neg:third_person_between")

    # Rule: a marriage word appears between the two mentions
    #         (<P1>) ... (妻子|丈夫|老公|老婆|情侣) ... (<P2>)
    if MARRIED.intersection(intermediate_tokens):
        yield spouse._replace(label=1, type="pos:wife_husband_between")

    # Rule: "<P1> 和 <P2> ... 结婚" (<P1> and <P2> ... get married)
    #         (<P1>)(和)(<P2>) ... (结婚)
    if (u"和" in intermediate_tokens) and (u"结婚" in tail_tokens):
        yield spouse._replace(label=1, type="pos:married_after")

    # Rule: a familial-relation word (father/mother/brother/sister, etc.) appears between the mentions
    #         (<P1>) ... (父亲|母亲|姐姐|哥哥|...) ... (<P2>)
    if FAMILY.intersection(intermediate_tokens):
        yield spouse._replace(label=-1, type="neg:familial_between")
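
In a full DeepDive-style app these functions are driven by the framework, but for a quick local sanity check they can also be called directly on made-up values. The toy data below is invented for illustration and assumes helpers like the sketch above.

if __name__ == "__main__":
    # Toy sentence "张三 和 李四 结婚 了" ("Zhang San and Li Si got married"), both names tagged PERSON.
    toy_tokens = [u"张三", u"和", u"李四", u"结婚", u"了"]
    toy_ner = ["PERSON", "O", "PERSON", "O", "O"]

    mentions = list(extract("doc1", 0, toy_tokens, toy_ner))
    # Expect two single-token PERSON mentions: 张三 at (0, 0) and 李四 at (2, 2).
    m1, m2 = mentions

    labels = list(supervise(
        m1[0], m1[4], m1[5], m2[0], m2[4], m2[5],
        "doc1", 0, u"张三和李四结婚了",
        toy_tokens, [], toy_ner, [], []))
    # Expect one positive label from the "和 ... 结婚" (and ... married) rule.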