def get_features_for_row(row):
    """Extract features for a gene mention row.

    Combines the generic ddlib mention features with the project's custom
    features, wrapping each feature name in a Feature record keyed by
    (doc_id, section_id, mention_id).
    """
    f = Feature(doc_id=row.doc_id, section_id=row.section_id,
                mention_id=row.mention_id, name=None)

    # (1) Generic ddlib features over the mention span
    sentence = util.create_ddlib_sentence(row)
    span = ddlib.Span(begin_word_id=row.mention_wordidxs[0],
                      length=len(row.mention_wordidxs))
    features = [f._replace(name=feat)
                for feat in ddlib.get_generic_features_mention(sentence, span)]

    # Project-specific custom features
    features.extend(f._replace(name=feat) for feat in get_custom_features(row))

    # (2) NOTE: a GENE_TYPE[...] feature (matching ENSEMBL_TYPES against
    # row.mention_type) was tried here but caused massive overfitting; it
    # needs negative supervision of canonical & noncanonical symbols before
    # it can be reintroduced.
    return features
def get_features_for_row(row):
    """Extract features for a pheno-acronym row (short form + long form).

    Builds a single ddlib Span covering every word index in either the
    short-form or the long-form mention, gathers the generic ddlib mention
    features for it, and optionally drops feature families listed under
    OPTS['exclude_generic'].
    """
    OPTS = config.PHENO_ACRONYMS['F']
    features = []
    f = Feature(doc_id=row.doc_id, section_id=row.section_id,
                mention_id=row.mention_id, name=None)

    # (1) Get generic ddlib features
    sentence = util.create_ddlib_sentence(row)
    all_word_idxs = row.short_wordidxs + row.long_wordidxs
    start = min(all_word_idxs)
    # BUG FIX: a span covering the inclusive index range [start .. max] has
    # length max - start + 1; the old "max - start" silently dropped the
    # last word of the combined mention.
    length = max(all_word_idxs) - start + 1
    span = ddlib.Span(begin_word_id=start, length=length)
    # Sanity checks: non-empty span that stays inside the sentence.
    # (The old "assert len(span) > 0" looked vacuous: for a namedtuple-like
    # Span, len() is the field count, not the span length.)
    assert length > 0, row
    assert start + length <= len(row.words), (start + length, len(row.words), row)

    generic_features = [f._replace(name=feat)
                        for feat in ddlib.get_generic_features_mention(sentence, span)]

    # Optionally filter out some generic feature families.
    # BUG FIX: the entries are Feature namedtuples, so the prefix test must
    # inspect .name -- the old "feat.startswith(...)" raised AttributeError
    # whenever 'exclude_generic' was actually configured.
    if OPTS.get('exclude_generic'):
        excluded = tuple(OPTS['exclude_generic'])
        generic_features = [feat for feat in generic_features
                            if not feat.name.startswith(excluded)]

    features += generic_features
    return features
def get_features_for_candidate(row):
    """Extract features for a gene-pheno relation candidate.

    Emits the generic ddlib relation features (keeping only those accepted
    by take_feature) followed by the project's custom features, each wrapped
    in a Feature record keyed by (doc_id, section_id, relation_id).
    """
    f = Feature(doc_id=row.doc_id, section_id=row.section_id,
                relation_id=row.relation_id, name=None)
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib, over the two argument spans
    gene_span = ddlib.Span(begin_word_id=row.gene_wordidxs[0],
                           length=len(row.gene_wordidxs))
    pheno_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                            length=len(row.pheno_wordidxs))
    features = [f._replace(name=feat)
                for feat in ddlib.get_generic_features_relation(dds, gene_span, pheno_span)
                if take_feature(feat)]

    # (2) Custom features
    features.extend(f._replace(name=feat) for feat in get_custom_features(row, dds))

    # NOTE(history): sentence-start features (START_SENT_*) seemed to hurt,
    # and NERs-between features traded a little precision for a little
    # recall; both were tried and removed.
    return features
def get_features_for_candidate(row):
    """Extract generic ddlib relation features for a genevar-pheno candidate.

    Each feature name is wrapped in a Feature record keyed by
    (doc_id, section_id, relation_id).
    """
    f = Feature(doc_id=row.doc_id, section_id=row.section_id,
                relation_id=row.relation_id, name=None)
    dds = util.create_ddlib_sentence(row)

    # Spans of the two relation arguments
    genevar_span = ddlib.Span(begin_word_id=row.genevar_wordidxs[0],
                              length=len(row.genevar_wordidxs))
    pheno_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                            length=len(row.pheno_wordidxs))

    features = []
    for feat in ddlib.get_generic_features_relation(dds, genevar_span, pheno_span):
        features.append(f._replace(name=feat))
    return features
def get_features_for_candidate(row):
    """Extract features for a candidate mention.

    Emits the generic ddlib mention features and, when OPTS['closest-verb']
    is enabled, the lemma of the verb nearest to the mention.

    Returns a list of (doc_id, section_id, mention_id, feature_name) tuples.
    """
    features = []
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib, over the mention span
    span = ddlib.Span(begin_word_id=row.mention_wordidxs[0],
                      length=len(row.mention_wordidxs))
    features += [(row.doc_id, row.section_id, row.mention_id, feat)
                 for feat in ddlib.get_generic_features_mention(dds, span)]

    # (2) Add the closest verb by raw word distance; distance 0 (a verb
    # inside the mention itself) is deliberately excluded.
    if OPTS.get('closest-verb'):
        verb_idxs = [i for i, p in enumerate(row.poses) if p.startswith("VB")]
        # BUG FIX: the old code called len() and then min() on a filter
        # object, which raises TypeError on Python 3 (filter returns a
        # one-shot iterator); materialize the distances as a list instead.
        dists = [(min(abs(i - j) for j in row.mention_wordidxs), i)
                 for i in verb_idxs]
        dists = [d for d in dists if d[0] > 0]
        if dists:
            # min() picks the smallest distance; ties break on word index.
            verb = row.lemmas[min(dists)[1]]
            features.append((row.doc_id, row.section_id, row.mention_id,
                             'NEAREST_VERB_[%s]' % (verb,)))
    return features