def get_form_features(form, form_type, field_elems=None):
    """Return a list of feature dicts, one dict per visible submittable
    field in a <form> element.

    If *field_elems* is not given, the annotatable fields are discovered
    from *form* via ``get_fields_to_annotate``.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)
    text_before, text_after = get_text_around_elems(form, field_elems)
    features = [_elem_features(el) for el in field_elems]
    last = len(features) - 1
    for idx, (el, feat) in enumerate(zip(field_elems, features)):
        if idx == 0:
            feat['is-first'] = True
        if idx == last:
            feat['is-last'] = True
        feat['form-type'] = form_type
        # last 6 tokens of the text preceding the field, as 1- and 2-grams
        before_tokens = tokenize(normalize(text_before[el]))[-6:]
        feat['text-before'] = token_ngrams(before_tokens, 1, 2)
        # first 5 tokens of the text following the field, as 1- and 2-grams
        after_tokens = tokenize(normalize(text_after[el]))[:5]
        feat['text-after'] = token_ngrams(after_tokens, 1, 2)
        feat['bias'] = 1
    return features
def get_form_features(form, form_type, field_elems=None):
    """Return a list of feature dicts, a dict per visible submittable
    field in a <form> element.

    NOTE(review): this definition duplicates an identical
    ``get_form_features`` earlier in the file and shadows it at import
    time — confirm which copy is intended and remove the other.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)
    text_before, text_after = get_text_around_elems(form, field_elems)
    res = [_elem_features(elem) for elem in field_elems]
    n_fields = len(res)
    for idx in range(n_fields):
        elem = field_elems[idx]
        feat = res[idx]
        if idx == 0:
            feat['is-first'] = True
        if idx == n_fields - 1:
            feat['is-last'] = True
        feat['form-type'] = form_type
        # text before the field: keep only the trailing 6 tokens
        feat['text-before'] = token_ngrams(
            tokenize(normalize(text_before[elem]))[-6:], 1, 2)
        # text after the field: keep only the leading 5 tokens
        feat['text-after'] = token_ngrams(
            tokenize(normalize(text_after[elem]))[:5], 1, 2)
        feat['bias'] = 1
    return res
def keyword_text_relevancy(text: str, pos_keywords: List[str], neg_keywords: List[str], max_ngram=1):
    """Score *text* by keyword hits.

    *text* is tokenized and expanded into 1..*max_ngram* token n-grams;
    each keyword list is scored by how many of its entries appear among
    those n-grams (scaled by ``_scale_relevancy``). Negative hits are
    subtracted at a third of their weight, and the result is clamped
    at zero.
    """
    ngram_set = set(token_ngrams(tokenize(text), 1, max_ngram))

    def _score(keywords: List[str]) -> float:
        # count keywords present among the text's n-grams
        hits = sum(1 for kw in keywords if kw in ngram_set)
        return _scale_relevancy(hits, keywords)

    return max(0, _score(pos_keywords) - 0.33 * _score(neg_keywords))
def test_token_ngrams(seq, min_n, max_n, result):
    """Parametrized check: token_ngrams(seq, min_n, max_n) equals *result*."""
    observed = token_ngrams(seq, min_n, max_n)
    assert observed == result