Example #1
def get_form_features(form, form_type, field_elems=None):
    """
    Return a list of feature dicts, a dict per visible submittable
    field in a <form> element.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)
    text_before, text_after = get_text_around_elems(form, field_elems)
    res = [_elem_features(elem) for elem in field_elems]

    for idx, elem_feat in enumerate(res):
        if idx == 0:
            elem_feat['is-first'] = True
        if idx == len(res) - 1:
            elem_feat['is-last'] = True

        elem_feat['form-type'] = form_type
        # 1- and 2-token n-grams of (up to) the last 6 tokens before the element
        text = normalize(text_before[field_elems[idx]])
        tokens = tokenize(text)[-6:]
        elem_feat['text-before'] = token_ngrams(tokens, 1, 2)

        # 1- and 2-token n-grams of (up to) the first 5 tokens after the element
        text = normalize(text_after[field_elems[idx]])
        tokens = tokenize(text)[:5]
        elem_feat['text-after'] = token_ngrams(tokens, 1, 2)
        elem_feat['bias'] = 1  # constant bias feature

    return res
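
A minimal usage sketch (the HTML and the "search" form_type label are hypothetical; it assumes the helpers used above, such as get_fields_to_annotate and _elem_features, are importable from the same module):

import lxml.html

doc = lxml.html.fromstring("""
<form>
  <label>Search</label> <input type="text" name="q"/>
  <input type="submit" value="Go"/>
</form>
""")
form = doc.forms[0]
features = get_form_features(form, form_type="search")
# Each dict in `features` contains the element's own features plus
# 'text-before'/'text-after' n-grams, 'form-type', 'bias', and the
# positional flags 'is-first' / 'is-last'.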
Example #2
from typing import List


def keyword_text_relevancy(text: str,
                           pos_keywords: List[str],
                           neg_keywords: List[str],
                           max_ngram=1):
    """
    Score how relevant ``text`` is to ``pos_keywords``; matches of
    ``neg_keywords`` lower the score, and the result is clipped at zero.
    """
    tokens = tokenize(text)
    tokens = set(token_ngrams(tokens, 1, max_ngram))

    def _score(keywords: List[str]) -> float:
        # count the keywords present among the token n-grams, then scale
        s = sum(int(k in tokens) for k in keywords)
        return _scale_relevancy(s, keywords)

    pos_score = _score(pos_keywords)
    neg_score = _score(neg_keywords)

    # negative matches penalize the score, but never push it below zero
    return max(0, pos_score - 0.33 * neg_score)
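
A minimal calling sketch (the text and keyword lists are illustrative; _scale_relevancy is assumed to map the raw match count to a relevancy value, e.g. relative to the number of keywords):

score = keyword_text_relevancy(
    "Sign up for our weekly newsletter",
    pos_keywords=["sign", "newsletter", "subscribe"],
    neg_keywords=["login", "password"],
    max_ngram=1,
)
# The score grows with positive-keyword matches and is reduced (never
# below zero) by negative-keyword matches.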
Example #3
def test_token_ngrams(seq, min_n, max_n, result):
    assert token_ngrams(seq, min_n, max_n) == result
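
As written, the test receives its cases from the test framework; a sketch of the pytest parametrization it would need (the expected outputs are illustrative and assume token_ngrams returns space-joined n-grams, shortest n first):

import pytest

@pytest.mark.parametrize("seq, min_n, max_n, result", [
    (["a", "b", "c"], 1, 1, ["a", "b", "c"]),
    (["a", "b", "c"], 1, 2, ["a", "b", "c", "a b", "b c"]),
    (["a", "b", "c"], 2, 3, ["a b", "b c", "a b c"]),
])
def test_token_ngrams(seq, min_n, max_n, result):
    assert token_ngrams(seq, min_n, max_n) == result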