Exemplo n.º 1
0
def get_features_for_row(row):
    """Build the feature records for a single mention row.

    Each returned item is a ``Feature`` carrying the row's ``doc_id``,
    ``section_id`` and ``mention_id`` together with one feature name.
    Names produced by ``ddlib.get_generic_features_mention`` over the
    mention's word span come first, followed by the names returned by
    ``get_custom_features(row)``.
    """
    # Template record; only ``name`` differs between the emitted features.
    template = Feature(doc_id=row.doc_id,
                       section_id=row.section_id,
                       mention_id=row.mention_id,
                       name=None)

    # The span starts at the first mention word and covers as many words
    # as the mention has indexes.
    ddlib_sentence = util.create_ddlib_sentence(row)
    mention_span = ddlib.Span(begin_word_id=row.mention_wordidxs[0],
                              length=len(row.mention_wordidxs))

    features = []
    for feat_name in ddlib.get_generic_features_mention(ddlib_sentence,
                                                        mention_span):
        features.append(template._replace(name=feat_name))
    for feat_name in get_custom_features(row):
        features.append(template._replace(name=feat_name))

    # Deliberately disabled: a 'GENE_TYPE[<type>]' feature for the first
    # ENSEMBL_TYPES entry found (case-insensitively) in row.mention_type.
    # Including it creates massive overfitting; negative supervision of
    # canonical & noncanonical symbols is needed before trying it again.
    return features
def get_features_for_row(row):
    """Build the generic ddlib feature records for one acronym row.

    The span handed to ``ddlib.get_generic_features_mention`` starts at the
    smallest word index found in ``row.short_wordidxs + row.long_wordidxs``
    and has length ``max_index - min_index``.

    If ``config.PHENO_ACRONYMS['F']`` has a truthy ``'exclude_generic'``
    entry, every generic feature whose name starts with one of its prefixes
    is dropped.

    Returns:
        A list of ``Feature`` records (``doc_id``, ``section_id``,
        ``mention_id`` taken from ``row``), one per retained feature name.

    Raises:
        AssertionError: if the span would reach past the end of
            ``row.words``.
    """
    OPTS = config.PHENO_ACRONYMS['F']
    f = Feature(doc_id=row.doc_id,
                section_id=row.section_id,
                mention_id=row.mention_id,
                name=None)

    # (1) Get generic ddlib features
    sentence = util.create_ddlib_sentence(row)
    allWordIdxs = row.short_wordidxs + row.long_wordidxs
    start = min(allWordIdxs)
    # NOTE(review): this length leaves out the word at max(allWordIdxs);
    # if the span is meant to be inclusive it should be `+ 1` -- confirm
    # before changing, since it alters the extracted features.
    length = max(allWordIdxs) - start
    span = ddlib.Span(begin_word_id=start, length=length)
    # NOTE(review): if ddlib.Span is a namedtuple, len(span) is its field
    # count rather than the span length -- verify this check is meaningful.
    assert len(span) > 0, row
    assert start + length < len(row.words), (start + length, len(row.words),
                                             row)

    # Optionally filter out some generic features. The prefix test must run
    # on the raw feature-name strings: once wrapped in a Feature record
    # there is no ``startswith`` to call.
    excluded = tuple(OPTS.get('exclude_generic') or ())
    return [
        f._replace(name=feat)
        for feat in ddlib.get_generic_features_mention(sentence, span)
        if not (excluded and feat.startswith(excluded))
    ]
Exemplo n.º 3
0
def get_features_for_candidate(row):
    """Build the feature records for one gene/phenotype relation candidate.

    Generic relation features from ddlib are kept only when
    ``take_feature`` accepts their name; the names returned by
    ``get_custom_features(row, dds)`` are appended after them. Every record
    is a ``Feature`` carrying the row's ``doc_id``, ``section_id`` and
    ``relation_id``.
    """
    template = Feature(doc_id=row.doc_id,
                       section_id=row.section_id,
                       relation_id=row.relation_id,
                       name=None)
    dds = util.create_ddlib_sentence(row)

    # Each span starts at the entity's first word index and covers as many
    # words as the entity has indexes.
    gene_span = ddlib.Span(begin_word_id=row.gene_wordidxs[0],
                           length=len(row.gene_wordidxs))
    pheno_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                            length=len(row.pheno_wordidxs))

    # (1) Generic ddlib relation features that pass take_feature.
    features = [
        template._replace(name=feat_name)
        for feat_name in ddlib.get_generic_features_relation(
            dds, gene_span, pheno_span)
        if take_feature(feat_name)
    ]

    # (2) Custom features.
    features += [
        template._replace(name=feat_name)
        for feat_name in get_custom_features(row, dds)
    ]

    # Experiments left disabled:
    #  * generic mention features over the first four words of the sentence,
    #    prefixed 'START_SENT_' -- these seemed to be hurting (?)
    #  * NER-between features from create_ners_between(...) -- gave a little
    #    LESS precision and a little MORE recall
    return features
Exemplo n.º 4
0
def get_features_for_candidate(row):
    """Build generic ddlib relation features for a genevar/phenotype candidate.

    Returns one ``Feature`` record (``doc_id``, ``section_id``,
    ``relation_id`` taken from ``row``) per name produced by
    ``ddlib.get_generic_features_relation``. No custom features are added.
    """
    template = Feature(doc_id=row.doc_id,
                       section_id=row.section_id,
                       relation_id=row.relation_id,
                       name=None)
    dds = util.create_ddlib_sentence(row)

    # Each span starts at the entity's first word index and covers as many
    # words as the entity has indexes.
    genevar_span = ddlib.Span(begin_word_id=row.genevar_wordidxs[0],
                              length=len(row.genevar_wordidxs))
    pheno_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                            length=len(row.pheno_wordidxs))

    feature_names = ddlib.get_generic_features_relation(dds, genevar_span,
                                                        pheno_span)
    return [template._replace(name=feat_name) for feat_name in feature_names]
Exemplo n.º 5
0
def get_features_for_candidate(row):
    """Extract features for a candidate mention.

    Produces the generic ddlib mention features and, when the module-level
    ``OPTS`` has a truthy ``'closest-verb'`` entry, one extra
    ``NEAREST_VERB_[<lemma>]`` feature for the verb closest to the mention.

    Returns:
        A list of ``(doc_id, section_id, mention_id, feature_name)`` tuples.
    """
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib
    span = ddlib.Span(begin_word_id=row.mention_wordidxs[0],
                      length=len(row.mention_wordidxs))
    features = [(row.doc_id, row.section_id, row.mention_id, feat)
                for feat in ddlib.get_generic_features_mention(dds, span)]

    # (2) Add the closest verb by raw distance
    if OPTS.get('closest-verb'):
        # (distance, word index) for every token whose POS tag starts with
        # "VB". Distance is the smallest index gap to any mention word; a
        # gap of 0 means the verb is itself a mention word, so it is skipped.
        # Built as a real list: a lazy filter() object has no len().
        dists = []
        for i, pos in enumerate(row.poses):
            if not pos.startswith("VB"):
                continue
            dist = min(abs(i - j) for j in row.mention_wordidxs)
            if dist > 0:
                dists.append((dist, i))
        if dists:
            # min() over the tuples picks the smallest distance and breaks
            # ties by the lower word index.
            verb = row.lemmas[min(dists)[1]]
            features.append((row.doc_id, row.section_id, row.mention_id,
                             'NEAREST_VERB_[%s]' % (verb, )))
    return features