def extract(
        mention_id="text",
        doc_begin_index="int",
        doc_end_index="int",
        doc_id="text",
        position="text",
        sentence_index="int",
        tokens="text[]",
        pos_tags="text[]",
):
    """
    Uses DDLIB to generate generic features for a crime mention candidate.

    Yields [mention_id, feature] rows, one per generic feature produced by
    ddlib over the mention span.
    """
    # Load keyword dictionaries using ddlib, for domain-specific features.
    # Words in "crime" dictionary are indicative of a crime mention;
    # words in "non_crime" dictionary are indicative of a non-crime mention.
    APP_HOME = os.environ['APP_HOME']
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_crime.txt",
                          dict_id="crime")
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_non_crime.txt",
                          dict_id="non_crime")

    # Create a DDLIB sentence object, which is just a list of DDLIB Word
    # objects.  (The unused left-window phrase computation that this block
    # previously carried has been removed; no feature ever consumed it.)
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                # Bug fix: the raw token was passed as the lemma, despite the
                # stated convention "lemma for Vietnamese: lowercase" (and the
                # sibling legal-penalty extractor, which lowercases).
                lemma=t.lower(),
                pos=pos_tags[i],
                ner=None,
                # No dependency parse in this pipeline: mark every word as
                # ROOT (DDLIB uses -1 for ROOT) with no label.
                dep_par=-1,
                dep_label=None))

    # Create DDLIB Span for the crime candidate mention.
    mention_span = ddlib.Span(begin_word_id=doc_begin_index,
                              length=(doc_end_index - doc_begin_index + 1))

    # Generate the generic features using DDLIB.
    for feature in ddlib.get_generic_features_mention(sent, mention_span):
        yield [mention_id, feature]
def add_features_generic(mention_id, pheno_words, sentence):
    """Emit generic ddlib features for a phenotype mention (generic library only)."""
    # Keyword dictionaries consumed by the generic feature library.
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/pheno_var.tsv", "VARKW")
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/pheno_patient.tsv",
                          "PATIENTKW")

    # Pack the sentence into the column-oriented dict ddlib.unpack_words
    # expects.
    sent_words = sentence.words
    obj = {
        'lemma': [w.lemma for w in sent_words],
        'words': [w.word for w in sent_words],
        'ner': [w.ner for w in sent_words],
        'pos': [w.pos for w in sent_words],
        # triplet format: 1-based parent index \t dep label \t 1-based index
        'dep_graph': [
            str(w.dep_parent + 1) + "\t" + w.dep_path + "\t" +
            str(w.in_sent_idx + 1) for w in sent_words
        ],
    }
    word_obj_list = ddlib.unpack_words(
        obj,
        lemma='lemma',
        pos='pos',
        ner='ner',
        words='words',
        dep_graph='dep_graph',
        dep_graph_parser=ddlib.dep_graph_parser_triplet)

    # Span covering the mention words; deduplicate features before printing.
    mention_span = ddlib.get_span(pheno_words[0].in_sent_idx, len(pheno_words))
    for feature in set(
            ddlib.get_generic_features_mention(word_obj_list, mention_span)):
        print_feature(sentence.doc_id, mention_id, feature)
def extract(
        p_id="text",
        e_id="text",
        p_begin_index="int",
        p_end_index="int",
        e_begin_index="int",
        e_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        lemmas="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the employment relation between two
    mentions; yields [p_id, e_id, feature] rows.
    """
    # Domain keyword dictionaries consumed by ddlib's generic features.
    ddlib.load_dictionary(os.path.abspath("../../../job_employ_keyword.txt"),
                          dict_id="has_employment")
    ddlib.load_dictionary(
        os.path.abspath("../../../job_no_employ_keyword.txt"),
        dict_id="no_employment")

    # Build the DDLIB sentence: one Word per token.  CoreNLP stores ROOT as
    # 0 while DDLIB expects -1, hence the -1 shift on dep_parents.
    sent = [
        ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=token,
            lemma=lemmas[idx],
            pos=pos_tags[idx],
            ner=ner_tags[idx],
            dep_par=dep_parents[idx] - 1,
            dep_label=dep_types[idx]) for idx, token in enumerate(tokens)
    ]

    # DDLIB Spans for the two candidate mentions.
    first_span = ddlib.Span(begin_word_id=p_begin_index,
                            length=(p_end_index - p_begin_index + 1))
    second_span = ddlib.Span(begin_word_id=e_begin_index,
                             length=(e_end_index - e_begin_index + 1))

    # Generate the generic relation features using DDLIB.
    for feature in ddlib.get_generic_features_relation(
            sent, first_span, second_span):
        yield [p_id, e_id, feature]
dds = util.create_ddlib_sentence(row) # (1) GENERIC FEATURES from ddlib span = ddlib.Span(begin_word_id=row.mention_wordidxs[0], length=len(row.mention_wordidxs)) features += [(row.doc_id, row.section_id, row.mention_id, feat) \ for feat in ddlib.get_generic_features_mention(dds, span)] # (2) Add the closest verb by raw distance if OPTS.get('closest-verb'): verb_idxs = [i for i, p in enumerate(row.poses) if p.startswith("VB")] if len(verb_idxs) > 0: dists = filter(lambda d : d[0] > 0, \ [(min([abs(i-j) for j in row.mention_wordidxs]), i) for i in verb_idxs]) if len(dists) > 0: verb = row.lemmas[min(dists)[1]] features.append((row.doc_id, row.section_id, row.mention_id, 'NEAREST_VERB_[%s]' % (verb, ))) return features # Load in manually defined keywords onto_path = lambda p: '%s/onto/%s' % (os.environ['GDD_HOME'], p) if __name__ == '__main__': if OPTS.get('sentence-kws'): ddlib.load_dictionary(onto_path('manual/pheno_sentence_keywords.tsv'), dict_id='pheno_kws') util.run_main_tsv(row_parser=parser.parse_tsv_row, row_fn=get_features_for_candidate)
#! /usr/bin/env python import sys, os import ddlib # DeepDive python utility ARR_DELIM = '~^~' # Load keyword dictionaries using ddlib, for domain-specific features # Words in "married" dictionary are indicative of marriage # Words in "non_married" dictionary are indicative of non_marriage BASE_DIR = os.path.dirname(os.path.realpath(__file__)) ddlib.load_dictionary(BASE_DIR + "/dicts/married.txt", dict_id="married") ddlib.load_dictionary(BASE_DIR + "/dicts/non_married.txt", dict_id="non_married") # For each input tuple for row in sys.stdin: parts = row.strip().split('\t') # Get all fields from a row words = parts[0].split(ARR_DELIM) lemmas = parts[1].split(ARR_DELIM) poses = parts[2].split(ARR_DELIM) dependencies = parts[3].split(ARR_DELIM) ners = parts[4].split(ARR_DELIM) relation_id = parts[5] p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[6:]] # Get a sentence from ddlib -- array of "Word" objects if len(dependencies) == 0: print >>sys.stderr, str(relation_id) + '\t' + 'DEP_PATH_EMPTY' continue
def extract(
        p_id="text",
        p_begin_index="int",
        p_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the legal penalty mention.

    Yields [p_id, feature] rows: the generic ddlib mention features plus two
    hand-crafted signals based on keyword phrases in the window to the left
    of the mention.
    """
    # Load keyword dictionaries using ddlib, for domain-specific features.
    # Words in "legal_penalty" are indicative of a legal-penalty mention;
    # words in "non_legal_penalty" are indicative of a non-penalty mention.
    APP_HOME = os.environ['APP_HOME']
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_legal_penalty.txt",
                          dict_id="legal_penalty")
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_non_legal_penalty.txt",
                          dict_id="non_legal_penalty")

    # Non-penalty signal phrases expected on the left of the candidate.
    # Bug fix: the file handle was previously opened and never closed.
    with open(APP_HOME + "/udf/dicts/kw_non_legal_penalty.txt",
              'r') as kw_file:
        NON_PENAL_SIGNALS_LEFT = frozenset(
            word.strip() for word in kw_file)

    WINDOW_SIZE = 10
    MAX_PHRASE_LENGTH = 5

    # All phrases (up to MAX_PHRASE_LENGTH tokens) in the WINDOW_SIZE-token
    # window to the left of the mention, lowercased to match the dictionary.
    low_tokens = [token.lower() for token in tokens]
    left_window = get_left_window(p_begin_index, low_tokens, WINDOW_SIZE)
    phrases_in_sentence_left = list(
        get_all_phrases_in_sentence(left_window, MAX_PHRASE_LENGTH))

    # Create a DDLIB sentence object, which is just a list of DDLIB Word
    # objects.  CoreNLP stores ROOT as 0 but DDLIB expects -1, hence the shift.
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=t.lower(),  # lemma for Vietnamese: lowercase
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_parents[i] - 1,
                dep_label=dep_types[i]))

    # Create DDLIB Span for the penalty candidate.
    penalty_span = ddlib.Span(begin_word_id=p_begin_index,
                              length=(p_end_index - p_begin_index + 1))

    # Generate the generic features using DDLIB.
    for feature in ddlib.get_generic_features_mention(sent, penalty_span):
        yield [p_id, feature]

    # Keywords representing non-legal-penalty appear on the left.
    if len(NON_PENAL_SIGNALS_LEFT.intersection(phrases_in_sentence_left)) > 0:
        yield [p_id, 'APPEAR_LEFT_KW_NON_LEGAL_PENALTY']

    # "phạt tù" (imprisonment) appears on the left of the mention.
    if "phạt tù" in phrases_in_sentence_left:
        yield [p_id, 'APPEAR_LEFT_PHAT_TU']
def add_features_generic(mention_id, gene_words, sentence):
    """Emit generic ddlib features for a gene mention (generic library only)."""
    # Keyword dictionaries consumed by the generic feature library,
    # loaded table-driven instead of one call per line.
    for fname, dict_id in (
            ("gene_var.tsv", "VARKW"),
            ("gene_knock.tsv", "KNOCKKW"),
            ("gene_amino.tsv", "AMINOKW"),
            ("gene_antigene.tsv", "ANTIGENEKW"),
            ("gene_dna.tsv", "DNAKW"),
            ("gene_downregulation.tsv", "DOWNREGKW"),
            ("gene_upregulation.tsv", "UPREGKW"),
            ("gene_tumor.tsv", "TUMORKW"),
            ("gene_gene.tsv", "GENEKW"),
            ("gene_expression.tsv", "EXPRESSKW"),
    ):
        ddlib.load_dictionary(BASE_DIR + "/dicts/features/" + fname, dict_id)

    # Column-oriented sentence dict in the shape ddlib.unpack_words expects.
    sent_words = sentence.words
    obj = {
        'lemma': [w.lemma for w in sent_words],
        'words': [w.word for w in sent_words],
        'ner': [w.ner for w in sent_words],
        'pos': [w.pos for w in sent_words],
        # triplet format: 1-based parent index \t dep label \t 1-based index
        'dep_graph': [
            str(w.dep_parent + 1) + "\t" + w.dep_path + "\t" +
            str(w.in_sent_idx + 1) for w in sent_words
        ],
    }
    word_obj_list = ddlib.unpack_words(
        obj,
        lemma='lemma',
        pos='pos',
        ner='ner',
        words='words',
        dep_graph='dep_graph',
        dep_graph_parser=ddlib.dep_graph_parser_triplet)

    # Span over the mention words; deduplicate features before printing.
    gene_span = ddlib.get_span(gene_words[0].in_sent_idx, len(gene_words))
    for feature in set(
            ddlib.get_generic_features_mention(word_obj_list, gene_span)):
        print_feature(sentence.doc_id, mention_id, feature)
def add_features_generic(mention_id, gene_words, sentence):
    """Add generic ddlib features for a gene mention, using ONLY the
    generic feature library."""
    # (dictionary file basename, ddlib dictionary id) pairs to load.
    keyword_dicts = [
        ("gene_var.tsv", "VARKW"),
        ("gene_knock.tsv", "KNOCKKW"),
        ("gene_amino.tsv", "AMINOKW"),
        ("gene_antigene.tsv", "ANTIGENEKW"),
        ("gene_dna.tsv", "DNAKW"),
        ("gene_downregulation.tsv", "DOWNREGKW"),
        ("gene_upregulation.tsv", "UPREGKW"),
        ("gene_tumor.tsv", "TUMORKW"),
        ("gene_gene.tsv", "GENEKW"),
        ("gene_expression.tsv", "EXPRESSKW"),
    ]
    for basename, dict_id in keyword_dicts:
        ddlib.load_dictionary(BASE_DIR + "/dicts/features/" + basename,
                              dict_id)

    # Build the column-wise sentence representation ddlib wants.
    obj = {'lemma': [], 'words': [], 'ner': [], 'pos': [], 'dep_graph': []}
    for word in sentence.words:
        obj['lemma'].append(word.lemma)
        obj['words'].append(word.word)
        obj['ner'].append(word.ner)
        obj['pos'].append(word.pos)
        # triplet: 1-based parent index, dep label, 1-based word index
        obj['dep_graph'].append("%d\t%s\t%d" % (
            word.dep_parent + 1, word.dep_path, word.in_sent_idx + 1))
    word_obj_list = ddlib.unpack_words(
        obj,
        lemma='lemma',
        pos='pos',
        ner='ner',
        words='words',
        dep_graph='dep_graph',
        dep_graph_parser=ddlib.dep_graph_parser_triplet)

    # Span covering the gene mention words.
    gene_span = ddlib.get_span(gene_words[0].in_sent_idx, len(gene_words))

    # Collect distinct features, then print each one.
    unique_features = set(
        ddlib.get_generic_features_mention(word_obj_list, gene_span))
    for feature in unique_features:
        print_feature(sentence.doc_id, mention_id, feature)
#! /usr/bin/env python import sys, os import ddlib # DeepDive python utility ARR_DELIM = '~^~' # Load keyword dictionaries using ddlib, for domain-specific features # Words in "married" dictionary are indicative of marriage # Words in "non_married" dictionary are indicative of non_marriage APP_HOME = os.environ['APP_HOME'] ddlib.load_dictionary(APP_HOME + "/udf/dicts/married.txt", dict_id="married") ddlib.load_dictionary(APP_HOME + "/udf/dicts/non_married.txt", dict_id="non_married") # For each input tuple for row in sys.stdin: parts = row.strip().split('\t') # Get all fields from a row words = parts[0].split(ARR_DELIM) lemmas = parts[1].split(ARR_DELIM) poses = parts[2].split(ARR_DELIM) dependencies = parts[3].split(ARR_DELIM) ners = parts[4].split(ARR_DELIM) relation_id = parts[5] p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[6:]] # Get a sentence from ddlib -- array of "Word" objects if len(dependencies) == 0: print >>sys.stderr, str(relation_id) + '\t' + 'DEP_PATH_EMPTY' continue