Пример #1
0
def test_contains_extractor_with_list():
    """Check contains_extractor on an input given as a list of words.

    Words present in the list must be flagged under ``contains(<word>)``;
    a differently-cased variant and an unrelated word must be either
    missing from the result or falsy.
    """
    tokens = ["Simple", "is", "better", "than", "complex"]
    extracted = contains_extractor(tokens)

    # Direct indexing: a missing key for a word that is present is a failure.
    assert_true(extracted["contains(Simple)"])
    # .get with a False default: the key is allowed to be missing entirely.
    assert_false(extracted.get("contains(simple)", False))
    assert_true(extracted["contains(complex)"])
    assert_false(extracted.get("contains(derp)", False))
Пример #2
0
def test_contains_extractor_with_string():
    """Check contains_extractor on an input given as a single string.

    Words appearing in the sentence must be flagged under
    ``contains(<word>)``; a differently-cased variant and an unrelated
    word must be either missing from the result or falsy.
    """
    sentence = "Simple is better than complex"
    feature_map = contains_extractor(sentence)

    # Direct indexing: a missing key for a word that is present is a failure.
    assert_true(feature_map["contains(Simple)"])
    # .get with a False default: the key is allowed to be missing entirely.
    assert_false(feature_map.get("contains(simple)", False))
    assert_true(feature_map["contains(complex)"])
    assert_false(feature_map.get("contains(derp)", False))
Пример #3
0
def extractor_base(document, train_set=None):
    """Return a feature mapping describing the text of ``document``.

    Starts from the result of ``contains_extractor(document, train_set)``
    and adds one boolean entry per regex probe, in the order listed below.
    Each entry is True when ``re.search`` finds the corresponding pattern
    anywhere in ``document``.

    ``document`` is handed directly to ``re.search``, so it must be a type
    that ``re.search`` accepts.

    NOTE(review): the key ``'has_curency()'`` looks like a misspelling of
    "currency". It is kept verbatim because the key is runtime data and
    existing consumers may depend on it -- confirm before renaming.
    """
    features = contains_extractor(document, train_set)

    # (feature key, pattern) pairs; tuple order fixes the insertion order.
    probes = (
        ('has_digit_sequence(7)', REGEX_DIGIT_SEQUENCE_7),
        ('has_digit_sequence(4)', REGEX_DIGIT_SEQUENCE_4),
        ('has_digit_sequence(2)', REGEX_DIGIT_SEQUENCE_2),
        ('has_digits(7)', REGEX_DIGIT_PRESENCE_7),
        ('has_curency()', REGEX_CURRENCY_PRESENCE),
        ('has_email_markers()', REGEX_EMAIL_MARKERS),
        ('has_hyperlink()', REGEX_HYPERLINK_MARKER),
    )
    for feature_key, pattern in probes:
        # A match object is always truthy, so this equals bool(match).
        features[feature_key] = re.search(pattern, document) is not None
    return features