Example #1
def test_stop_lang():
    # The default stop list removes common English words
    stopper = analysis.RegexTokenizer() | analysis.StopFilter()
    ls = [token.text for token in stopper(u("this is a test"))]
    assert ls == [u("test")]

    # lang="es" selects the built-in Spanish stop list
    es_stopper = analysis.RegexTokenizer() | analysis.StopFilter(lang="es")
    ls = [token.text for token in es_stopper(u("el lapiz es en la mesa"))]
    assert ls == ["lapiz", "mesa"]
Example #2
def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create analyzer used for tokenizing and normalizing tokens
    my_analyzer = (analysis.RegexTokenizer() | analysis.LowercaseFilter()
                   | analysis.StopFilter())

    # Create schema
    schema = Schema(url=ID(stored=True),
                    body=TEXT(stored=True, analyzer=my_analyzer))

    # Setup index
    os.makedirs(whoosh_dir, exist_ok=True)
    ix = index.create_in(whoosh_dir, schema)

    # Clear index
    writer = ix.writer()
    writer.commit(mergetype=writing.CLEAR)

    # Index documents
    writer = ix.writer()
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            with open(text_file, encoding="utf8") as tf:
                body = tf.read()
                url = text_file.replace(text_dir, "")
                writer.add_document(url=url, body=body)
                print("Added", url)
                loaded += 1

    writer.commit()
    print("\n\nLoaded", loaded, "documents")
Example #3
def test_biword_stopwords():
    # Note that the stop list is None here
    ana = (analysis.RegexTokenizer()
           | analysis.StopFilter(stoplist=None, minsize=3)
           | analysis.BiWordFilter())

    texts = [t.text for t in ana(u("stuff and then some"))]
    assert texts == ["stuff-and", "and-then", "then-some"]

    # Use a stop list here
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.BiWordFilter())

    texts = [t.text for t in ana(u("stuff and then some"))]
    assert texts == ["stuff-then", "then-some"]
Example #4
def test_composition2():
    ca = analysis.RegexTokenizer() | analysis.LowercaseFilter()
    # Composing an existing CompositeAnalyzer with another filter flattens
    # the result into a single three-component CompositeAnalyzer
    sa = ca | analysis.StopFilter()
    assert_equal(len(sa), 3)
    assert_equal(sa.__class__.__name__, "CompositeAnalyzer")
    assert_equal(sa[0].__class__.__name__, "RegexTokenizer")
    assert_equal(sa[1].__class__.__name__, "LowercaseFilter")
    assert_equal(sa[2].__class__.__name__, "StopFilter")
    assert_equal([t.text for t in sa(u("The ABC 123"))], ["abc", "123"])
Example #5
def test_shingle_stopwords():
    # Note that the stop list is None here
    ana = (analysis.RegexTokenizer()
           | analysis.StopFilter(stoplist=None, minsize=3)
           | analysis.ShingleFilter(size=3))

    texts = [t.text for t
             in ana(u("some other stuff and then some things To Check     "))]
    assert texts == ["some-other-stuff", "other-stuff-and", "stuff-and-then",
                     "and-then-some", "then-some-things", "some-things-Check"]

    # Use a stop list here
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.ShingleFilter(size=3))

    texts = [t.text for t
             in ana(u("some other stuff and then some things To Check     "))]
    assert texts == ["some-other-stuff", "other-stuff-then", "stuff-then-some",
                     "then-some-things", "some-things-check"]
Example #6
def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create the analyzers used for tokenizing and normalizing tokens:
    # bare tokenizer; + lowercase; + lowercase + stop words; stemming analyzer
    my_analyzers = [analysis.RegexTokenizer(),
                    analysis.RegexTokenizer() | analysis.LowercaseFilter(),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()
                     | analysis.StopFilter()),
                    analysis.StemmingAnalyzer()]
    # Create schemas
    schemas = []
    for my_analyzer in my_analyzers:
        schema = Schema(url=ID(stored=True),
                        body=TEXT(stored=True, analyzer=my_analyzer))
        schemas.append(schema)

    # Setup index
    ixs = []
    for i, my_analyzer in enumerate(my_analyzers):
        whoosh_dir_current = whoosh_dir + str(i) + '/'
        os.makedirs(whoosh_dir_current, exist_ok=True)
        ix = index.create_in(whoosh_dir_current, schemas[i])
        ixs.append(ix)

    # Clear each index, then open a fresh writer for it
    writers = []
    for i, my_analyzer in enumerate(my_analyzers):
        writer = ixs[i].writer()
        writer.commit(mergetype=writing.CLEAR)
        writer = ixs[i].writer()
        writers.append(writer)

    # Index documents
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            print('.', end='')
            with open(text_file, encoding="utf8") as tf:
                body = tf.read()
                url = text_file.replace(text_dir, "")
                for writer in writers:
                    writer.add_document(url=url, body=body)
                # print("Added", url)
                loaded += 1

    for writer in writers:
        writer.commit()

    print("\n\nLoaded", loaded, "documents")
Example #7
def test_composition3():
    # Piping a tokenizer into a filter produces a CompositeAnalyzer
    sa = analysis.RegexTokenizer() | analysis.StopFilter()
    assert_equal(sa.__class__.__name__, "CompositeAnalyzer")