def extract(
        doc_id="text",
        sentence_index="int",
        tokens="text[]",
        ner_tags="text[]",
):
    """
    Find maximal runs of consecutive tokens tagged PERSON and yield one
    mention per run.

    Args (DeepDive-style type annotations as defaults):
        doc_id: document identifier.
        sentence_index: index of the sentence within the document.
        tokens: sentence tokens.
        ner_tags: per-token NER tags, parallel to ``tokens``.

    Yields:
        [mention_id, mention_text, doc_id, sentence_index, begin_index,
        end_index] — ``begin_index``/``end_index`` are inclusive token
        indexes and ``mention_id`` encodes doc, sentence and span.
    """
    ner_tags = transform_ner_tags(ner_tags)
    num_tokens = len(ner_tags)
    # Debug separator. The original used the Python-2-only statement
    # `print >>sys.stderr, ...`; sys.stderr.write works on both 2 and 3.
    sys.stderr.write('=============================\n')

    # First token index of each maximal PERSON run: a PERSON token whose
    # predecessor is absent or not PERSON. (`range` instead of the
    # Python-2-only `xrange`; the sequence is sentence-sized, so
    # materializing it under Python 2 is harmless.)
    first_indexes = (
        i for i in range(num_tokens)
        if ner_tags[i] == "PERSON" and (i == 0 or ner_tags[i - 1] != "PERSON"))

    for begin_index in first_indexes:
        # Advance past the run; end_index ends up on the last PERSON token.
        end_index = begin_index + 1
        while end_index < num_tokens and ner_tags[end_index] == "PERSON":
            end_index += 1
        end_index -= 1

        mention_id = "%s_%d_%d_%d" % (
            doc_id, sentence_index, begin_index, end_index)
        # Slice join replaces the original map(lambda i: tokens[i], ...)
        # over the same inclusive range — identical result, idiomatic.
        mention_text = " ".join(tokens[begin_index:end_index + 1])

        yield [
            mention_id,
            mention_text,
            doc_id,
            sentence_index,
            begin_index,
            end_index,
        ]
def supervise(
        p1_id="text", p1_begin="int", p1_end="int",
        p2_id="text", p2_begin="int", p2_end="int",
        doc_id="text", sentence_index="int", sentence_text="text",
        tokens="text[]", pos_tags="text[]", ner_tags="text[]",
        dep_types="text[]", dep_token_indexes="int[]",
):
    """
    Apply distant-supervision rules to one candidate person pair and
    yield a labeled SpouseLabel for every rule that fires.

    Labels: +1 for likely-spouse evidence, -1 for likely-not; ``type``
    records which rule fired. Multiple rules may fire for one pair.
    """
    ner_tags = transform_ner_tags(ner_tags)

    # Chinese cue words: spouse terms (wife/husband/couple) and
    # non-spouse family-relation terms (father/mother/siblings/...).
    MARRIED = frozenset([u"妻子", u"丈夫", u"老公", u"老婆", u"情侣"])
    FAMILY = frozenset([u"父亲", u"母亲", u"姐姐", u"哥哥", u"侄子",
                        u"爸爸", u"妈妈", u"姐弟", u"姐妹"])
    MAX_DIST = 10

    # Token span strictly between the two mentions (order-independent),
    # and everything after the later mention.
    p1_end_idx = min(p1_end, p2_end)
    p2_start_idx = max(p1_begin, p2_begin)
    p2_end_idx = max(p1_end, p2_end)
    intermediate_tokens = tokens[p1_end_idx + 1 : p2_start_idx]
    intermediate_ner_tags = ner_tags[p1_end_idx + 1 : p2_start_idx]
    tail_tokens = tokens[p2_end_idx + 1 :]

    spouse = SpouseLabel(p1_id=p1_id, p2_id=p2_id, label=None, type=None)

    # Rule: candidates that are too far apart.
    if len(intermediate_tokens) > MAX_DIST:
        yield spouse._replace(label=-1, type="neg:far_apart")

    # Rule: a third person mentioned between the candidates.
    if "PERSON" in intermediate_ner_tags:
        yield spouse._replace(label=-1, type="neg:third_person_between")

    # Rule: a wife/husband cue word between the candidates.
    # (<P1>)(...)(wife|husband)(...)(<P2>)
    # isdisjoint avoids building the intersection set just to test emptiness.
    if not MARRIED.isdisjoint(intermediate_tokens):
        yield spouse._replace(label=1, type="pos:wife_husband_between")

    # Rule: "<P1> 和 <P2> ... 结婚" — "and" between, "married" after.
    if (u"和" in intermediate_tokens) and (u"结婚" in tail_tokens):
        yield spouse._replace(label=1, type="pos:married_after")

    # Rule: a familial-relation cue word (brother|sister|father|mother...)
    # between the candidates suggests family, not spouses.
    if not FAMILY.isdisjoint(intermediate_tokens):
        yield spouse._replace(label=-1, type="neg:familial_between")
def supervise(
        p1_id="text", p1_begin="int", p1_end="int",
        p2_id="text", p2_begin="int", p2_end="int",
        doc_id="text", sentence_index="int", sentence_text="text",
        tokens="text[]", pos_tags="text[]", ner_tags="text[]",
        dep_types="text[]", dep_token_indexes="int[]",
):
    """
    Label one candidate person pair with distant-supervision rules,
    yielding a SpouseLabel for each rule that matches the sentence.
    """
    ner_tags = transform_ner_tags(ner_tags)

    # Cue-word sets: spouse indicators and (non-spouse) family relations.
    spouse_words = frozenset([u"妻子", u"丈夫", u"老公", u"老婆", u"情侣"])
    family_words = frozenset([u"父亲", u"母亲", u"姐姐", u"哥哥", u"侄子",
                              u"爸爸", u"妈妈", u"姐弟", u"姐妹"])
    max_gap = 10

    # Normalize the two spans into "earlier end / later begin / later end",
    # then slice out the between-mentions window and the sentence tail.
    earlier_end = min(p1_end, p2_end)
    later_begin = max(p1_begin, p2_begin)
    later_end = max(p1_end, p2_end)
    between = tokens[earlier_end + 1:later_begin]
    between_ner = ner_tags[earlier_end + 1:later_begin]
    after = tokens[later_end + 1:]

    candidate = SpouseLabel(p1_id=p1_id, p2_id=p2_id, label=None, type=None)

    # Negative: mentions separated by too many tokens.
    if len(between) > max_gap:
        yield candidate._replace(label=-1, type='neg:far_apart')

    # Negative: another PERSON appears between the two mentions.
    if 'PERSON' in between_ner:
        yield candidate._replace(label=-1, type='neg:third_person_between')

    # Positive: a wife/husband cue word sits between the mentions.
    if any(word in spouse_words for word in between):
        yield candidate._replace(label=1, type='pos:wife_husband_between')

    # Positive: "<P1> 和 <P2> ... 结婚" — conjunction between, "married" after.
    if u"和" in between and u"结婚" in after:
        yield candidate._replace(label=1, type='pos:married_after')

    # Negative: a familial-relation cue word between the mentions.
    if any(word in family_words for word in between):
        yield candidate._replace(label=-1, type='neg:familial_between')
def extract(
        doc_id="text",
        sentence_index="int",
        tokens="text[]",
        ner_tags="text[]",
):
    """
    Scan the sentence once and yield one mention tuple for every
    maximal run of consecutive PERSON-tagged tokens.

    Yields [mention_id, mention_text, doc_id, sentence_index,
    begin_index, end_index] with inclusive token indexes.
    """
    ner_tags = transform_ner_tags(ner_tags)
    total = len(ner_tags)
    print >> sys.stderr, '============================='

    pos = 0
    while pos < total:
        # Skip non-PERSON tokens.
        if ner_tags[pos] != "PERSON":
            pos += 1
            continue
        # pos is the first token of a PERSON run; walk to its end.
        run_start = pos
        while pos < total and ner_tags[pos] == "PERSON":
            pos += 1
        run_stop = pos - 1  # inclusive last PERSON token

        mention_id = "%s_%d_%d_%d" % (
            doc_id, sentence_index, run_start, run_stop)
        mention_text = " ".join(tokens[run_start:run_stop + 1])

        yield [
            mention_id,
            mention_text,
            doc_id,
            sentence_index,
            run_start,
            run_stop,
        ]