Example No. 1
def test_double_metaphone():
    from whoosh.lang.dmetaphone import double_metaphone

    names = {'maurice': ('MRS', None),
             'aubrey': ('APR', None),
             'cambrillo': ('KMPRL', 'KMPR'),
             'heidi': ('HT', None),
             'katherine': ('K0RN', 'KTRN'),
             'Thumbail': ('0MPL', 'TMPL'),
             'catherine': ('K0RN', 'KTRN'),
             'richard': ('RXRT', 'RKRT'),
             'bob': ('PP', None),
             'eric': ('ARK', None),
             'geoff': ('JF', 'KF'),
             'Through': ('0R', 'TR'),
             'Schwein': ('XN', 'XFN'),
             'dave': ('TF', None),
             'ray': ('R', None),
             'steven': ('STFN', None),
             'bryce': ('PRS', None),
             'randy': ('RNT', None),
             'bryan': ('PRN', None),
             'Rapelje': ('RPL', None),
             'brian': ('PRN', None),
             'otto': ('AT', None),
             'auto': ('AT', None),
             'Dallas': ('TLS', None),
             'maisey': ('MS', None),
             'zhang': ('JNK', None),
             'Chile': ('XL', None),
             'Jose': ('HS', None),
             'Arnow': ('ARN', 'ARNF'),
             'solilijs': ('SLLS', None),
             'Parachute': ('PRKT', None),
             'Nowhere': ('NR', None),
             'Tux': ('TKS', None)}

    for name in names.keys():
        dmn = double_metaphone(name)
        assert dmn == names[name]

    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter())
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)]

    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter(combine=True))
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
                       ('F', 1.0), ('FF', 0.5)]

    namefield = fields.TEXT(analyzer=mf)
    texts = list(namefield.process_text(u("Spruce View"), mode="query"))
    assert texts == [u('spruce'), 'SPRS', u('view'), 'F', 'FF']
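
Building on the same filter chain, a minimal "sounds-like" lookup could be sketched as below; the indexed names and the query word "kathryn" are illustrative assumptions, not part of the original test:

from whoosh import analysis, fields, query
from whoosh.filedb.filestore import RamStorage

# Phonetic analyzer: lowercase the words, then replace each with its
# double-metaphone codes (primary and, when present, secondary).
phonetic = (analysis.RegexTokenizer()
            | analysis.LowercaseFilter()
            | analysis.DoubleMetaphoneFilter())
schema = fields.Schema(name=fields.TEXT(analyzer=phonetic, stored=True))

ix = RamStorage().create_index(schema)
with ix.writer() as w:
    for n in ("katherine", "catherine", "maurice"):
        w.add_document(name=n)

with ix.searcher() as s:
    # Encode the query word with the same analyzer and OR the codes together.
    codes = list(schema["name"].process_text("kathryn", mode="query"))
    q = query.Or([query.Term("name", code) for code in codes])
    print(sorted(hit["name"] for hit in s.search(q)))
    # expected under these assumptions: ['catherine', 'katherine']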
Example No. 2
class Organization(fields.SchemaClass):
    # numero_de_da: Activity Declaration number (Numéro de Déclaration d'Activité)
    numero_de_da = fields.ID(stored=True, unique=True)
    # form_total: number of trainers
    form_total = fields.NUMERIC(stored=True)
    # da_siren: SIREN number of the organization
    da_siren = fields.ID(stored=True, unique=True)
    # da_no_etab: establishment number of the organization
    da_no_etab = fields.ID(stored=True)
    # da_raison_sociale: legal name (raison sociale)
    da_raison_sociale = fields.TEXT(stored=True,
                                    analyzer=ngram_analyzer,
                                    phrase=False)
    # adr_rue_physique: street of the physical address
    adr_rue_physique = fields.TEXT(stored=True)
    # adr_rue_complement_physique: additional line of the physical address
    adr_rue_complement_physique = fields.TEXT(stored=True)
    # adr_code_postal_physique: postal code of the physical address
    adr_code_postal_physique = fields.ID(stored=True)
    # adr_ville_physique: city of the physical address
    adr_ville_physique = fields.TEXT(stored=True)
    # adr_rue_postale: street of the mailing address
    adr_rue_postale = fields.TEXT(stored=True)
    # adr_rue_complement_postale: additional line of the mailing address
    adr_rue_complement_postale = fields.TEXT(stored=True)
    # adr_code_postal_postale: postal code of the mailing address
    adr_code_postal_postale = fields.ID(stored=True)
    # adr_ville_postale: city of the mailing address
    adr_ville_postale = fields.TEXT(stored=True)
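
The ngram_analyzer referenced by da_raison_sociale is defined elsewhere in the original project. A rough, hedged sketch of exercising this schema in memory follows; the NgramWordAnalyzer stand-in, the field values and the lookup are all assumptions:

from whoosh.analysis import NgramWordAnalyzer
from whoosh.filedb.filestore import RamStorage

# Stand-in for the project's ngram_analyzer (assumption); in the real module
# it must already exist before the Organization class body is evaluated.
ngram_analyzer = NgramWordAnalyzer(minsize=2, maxsize=4)

# Build a throwaway in-memory index from the schema class.
ix = RamStorage().create_index(Organization())
with ix.writer() as w:
    w.add_document(numero_de_da="11755555555",        # illustrative values only
                   da_siren="123456789",
                   da_raison_sociale="ACME FORMATION",
                   adr_ville_physique="Paris")

with ix.searcher() as s:
    # Unique ID fields allow a direct stored-document lookup.
    print(s.document(numero_de_da="11755555555"))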
Example No. 3
def test_sort_text_field():
    domain = (("Visual Display of Quantitative Information, The", 10),
              ("Envisioning Information", 10),
              ("Visual Explanations", 10),
              ("Beautiful Evidence", -10),
              ("Visual and Statistical Thinking", -10),
              ("Cognitive Style of Powerpoint", -10))
    sorted_titles = sorted(d[0] for d in domain)

    schema = fields.Schema(title=fields.TEXT(stored=True, sortable=True),
                           num=fields.NUMERIC(sortable=True))

    def test(ix):
        with ix.searcher() as s:
            # Sort by title
            r = s.search(query.Every(), sortedby="title")
            assert [hit["title"] for hit in r] == sorted_titles

            # Sort by reverse title
            facet = sorting.FieldFacet("title", reverse=True)
            r = s.search(query.Every(), sortedby=facet)
            assert [hit["title"] for hit in r] == list(reversed(sorted_titles))

            # Sort by num (-10 to 10) first, and within that, by reverse title
            facet = sorting.MultiFacet()
            facet.add_field("num")
            facet.add_field("title", reverse=True)

            r = s.search(query.Every(), sortedby=facet)
            target = ["Visual and Statistical Thinking",
                      "Cognitive Style of Powerpoint",
                      "Beautiful Evidence",
                      "Visual Explanations",
                      "Visual Display of Quantitative Information, The",
                      "Envisioning Information",
                      ]
            assert [hit["title"] for hit in r] == target

    # Single segment
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for title, num in domain:
            w.add_document(title=u(title), num=num)
    test(ix)

    # Multisegment
    ix = RamStorage().create_index(schema)
    # Segment 1
    with ix.writer() as w:
        for title, num in domain[:3]:
            w.add_document(title=u(title), num=num)
    # Segment 2
    with ix.writer() as w:
        for title, num in domain[3:]:
            w.add_document(title=u(title), num=num)
        w.merge = False
    test(ix)
Example No. 4
class WikiSchema(fields.SchemaClass):
    '''This describes the content that will be stored in the search index.'''

    # The field boost helps wiki page paths rank more prominently in results,
    # since they will also appear as links in the content of other pages.
    path = fields.ID(unique=True, field_boost=2.0, stored=True)

    # The content is stored so that highlights can be extracted to display.
    content = fields.TEXT(stored=True)
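
A sketch of putting a SchemaClass like this to work; the directory name, the sample page and the query below are illustrative assumptions rather than part of the original snippet:

import os

from whoosh import index
from whoosh.qparser import QueryParser

ix_dir = "wiki_index"  # hypothetical location
if not os.path.exists(ix_dir):
    os.mkdir(ix_dir)

# create_in accepts the SchemaClass itself; Whoosh instantiates it.
ix = index.create_in(ix_dir, WikiSchema)

with ix.writer() as w:
    # The path field carries the 2.0 boost declared above.
    w.add_document(path="/HomePage",
                   content="Welcome to the wiki. See also /Syntax and /Sandbox.")

with ix.searcher() as s:
    q = QueryParser("content", ix.schema).parse("sandbox")
    for hit in s.search(q):
        print(hit["path"], hit.highlights("content"))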
Example No. 5
def test_no_add():
    check_multi()
    from whoosh.multiproc import MpWriter

    schema = fields.Schema(text=fields.TEXT(stored=True, spelling=True,
                                            vector=True))
    with TempIndex(schema) as ix:
        with ix.writer(procs=3) as w:
            assert type(w) == MpWriter
Example No. 6
    def _mk_schema(self, dsinfo):
        from whoosh import fields as wf
        from whoosh.analysis import SimpleAnalyzer

        # this dict will harvest all discovered term definitions; terms found
        # to be undefined are also recorded here for faster decision-making
        # upon the next encounter
        definitions = {
            '@id': 'unique identifier of an entity',
            # TODO make proper JSON-LD definition
            'path':
            'path name of an entity relative to the searched base dataset',
            # TODO make proper JSON-LD definition
            'parentds': 'path of the dataset that contains an entity',
            # 'type' will not come from a metadata field, hence will not be detected
            'type': 'type of a record',
        }

        schema_fields = {
            n.lstrip('@'): wf.ID(stored=True, unique=n == '@id')
            for n in definitions
        }

        lgr.debug('Scanning for metadata keys')
        # quick first pass over all datasets to gather the needed schema fields
        log_progress(
            lgr.info,
            'idxschemabuild',
            'Start building search schema',
            total=len(dsinfo),
            label='Building search schema',
            unit=' Datasets',
        )
        for res in query_aggregated_metadata(
                # XXX TODO After #2156 datasets may not necessarily carry all
                # keys in the "unique" summary
                reporton='datasets',
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                recursive=True):
            meta = res.get('metadata', {})
            # no stringification of values for speed, we do not need/use the
            # actual values at this point, only the keys
            idxd = _meta2autofield_dict(meta, val2str=False)

            for k in idxd:
                schema_fields[k] = wf.TEXT(stored=False,
                                           analyzer=SimpleAnalyzer())
            log_progress(lgr.info,
                         'idxschemabuild',
                         'Scanned dataset at %s',
                         res['path'],
                         update=1,
                         increment=True)
        log_progress(lgr.info, 'idxschemabuild', 'Done building search schema')

        self.schema = wf.Schema(**schema_fields)
Example No. 7
def test_free_dates():
    a = analysis.StandardAnalyzer(stoplist=None)
    schema = fields.Schema(text=fields.TEXT(analyzer=a), date=fields.DATETIME)
    qp = qparser.QueryParser("text", schema)
    basedate = datetime(2010, 9, 20, 15, 16, 6, 454000)
    qp.add_plugin(dateparse.DateParserPlugin(basedate, free=True))

    q = qp.parse(u("hello date:last tuesday"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.Term)
    assert_equal(q[0].text, "hello")
    assert_equal(q[1].__class__, query.DateRange)
    assert_equal(q[1].startdate, adatetime(2010, 9, 14).floor())
    assert_equal(q[1].enddate, adatetime(2010, 9, 14).ceil())

    q = qp.parse(u("date:mar 29 1972 hello"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.DateRange)
    assert_equal(q[0].startdate, adatetime(1972, 3, 29).floor())
    assert_equal(q[0].enddate, adatetime(1972, 3, 29).ceil())
    assert_equal(q[1].__class__, query.Term)
    assert_equal(q[1].text, "hello")

    q = qp.parse(u("date:2005 march 2"))
    assert_equal(q.__class__, query.DateRange)
    assert_equal(q.startdate, adatetime(2005, 3, 2).floor())
    assert_equal(q.enddate, adatetime(2005, 3, 2).ceil())

    q = qp.parse(u("date:'2005' march 2"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 3)
    assert_equal(q[0].__class__, query.DateRange)
    assert_equal(q[0].startdate, adatetime(2005).floor())
    assert_equal(q[0].enddate, adatetime(2005).ceil())
    assert_equal(q[1].__class__, query.Term)
    assert_equal(q[1].fieldname, "text")
    assert_equal(q[1].text, "march")

    q = qp.parse(u("date:march 24 to dec 12"))
    assert_equal(q.__class__, query.DateRange)
    assert_equal(q.startdate, adatetime(2010, 3, 24).floor())
    assert_equal(q.enddate, adatetime(2010, 12, 12).ceil())

    q = qp.parse(u("date:5:10pm"))
    assert_equal(q.__class__, query.DateRange)
    assert_equal(q.startdate, adatetime(2010, 9, 20, 17, 10).floor())
    assert_equal(q.enddate, adatetime(2010, 9, 20, 17, 10).ceil())

    q = qp.parse(u("(date:30 june OR date:10 july) quick"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.Or)
    assert_equal(q[0][0].__class__, query.DateRange)
    assert_equal(q[0][1].__class__, query.DateRange)
Example No. 8
def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True,
               glob="*.mrc"):
    if not os.path.exists(ixdir):
        os.mkdir(ixdir)

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))
    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(title=fields.TEXT(analyzer=ana),
                           author=fields.TEXT(phrase=False),
                           subject=fields.TEXT(analyzer=ana, phrase=False),
                           file=fields.STORED, pos=fields.STORED,
                           )

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title

    print("Indexing with %d processor(s) and %d MB per processor"
          % (procs, limitmb))
    c = 0
    t = now()
    ix = index.create_in(ixdir, schema)
    with ix.writer(procs=procs, limitmb=limitmb,
                   multisegment=multisegment) as w:
        filenames = [filename for filename in os.listdir(basedir)
                     if fnmatch.fnmatch(filename, glob)]
        for filename in filenames:
            path = os.path.join(basedir, filename)
            print("Indexing", path)
            f = open(path, 'rb')
            for x, pos in read_file(f, mfields):
                w.add_document(title=uni(title(x)), author=uni(author(x)),
                               subject=uni(subjects(x)),
                               file=filename, pos=pos)
                c += 1
            f.close()
        print("Committing...")
    print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))
Example No. 9
def test_bypass_stemming2():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(content=fields.TEXT(analyzer=ana, spelling=True))

    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(
            content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study00:00:00"))
        w.add_document(content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study"))
        w.add_document(content=u("This is the first document we've added!"))
Example No. 10
def test_token_boost():
    from whoosh.analysis import RegexTokenizer, DoubleMetaphoneFilter
    ana = RegexTokenizer() | DoubleMetaphoneFilter()
    field = fields.TEXT(analyzer=ana, phrase=False)
    results = sorted(field.index(u("spruce view")))
    assert results == [
        (b('F'), 1, 1.0, b('\x00\x00\x00\x01')),
        (b('FF'), 1, 0.5, b('\x00\x00\x00\x01')),
        (b('SPRS'), 1, 1.0, b('\x00\x00\x00\x01')),
    ]
Example No. 11
def test_stored_fields():
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()

        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()

        dw.start_doc(2)
        dw.finish_doc()

        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()

        dw.close()
        seg.set_doc_count(4)

        pdr = codec.per_document_reader(st, seg)
        assert pdr.doc_count_all() == 4
        assert pdr.stored_fields(0) == {"a": "hello", "b": "there"}
        # Note: access out of order
        assert pdr.stored_fields(3) == {"a": "alfa", "b": "bravo"}
        assert pdr.stored_fields(1) == {"a": "one", "b": "two", "c": "three"}

        sfs = list(pdr.all_stored_fields())
        assert len(sfs) == 4
        assert sfs == [
            {
                "a": "hello",
                "b": "there"
            },
            {
                "a": "one",
                "b": "two",
                "c": "three"
            },
            {},
            {
                "a": "alfa",
                "b": "bravo"
            },
        ]
        pdr.close()
Example No. 12
def test_term_inspection_segment_reader():
    schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(
                title=u"My document",
                content=u"AA AA BB BB CC AA AA AA BB BB CC DD EE EE")
            w.add_document(title=u"My other document",
                           content=u"AA AÉ BB CC EE EE Aú AÚ DD")

        _check_inspection_results(ix)
Example No. 13
def test_stored_fields():
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()

        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()

        dw.start_doc(2)
        dw.finish_doc()

        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()

        dw.close()

        dr = codec.stored_fields_reader(st, seg)
        assert_equal(dr[0], {"a": "hello", "b": "there"})
        # Note: access out of order
        assert_equal(dr[3], {"a": "alfa", "b": "bravo"})
        assert_equal(dr[1], {"a": "one", "b": "two", "c": "three"})
        dr.close()

        dr = codec.stored_fields_reader(st, seg)
        sfs = list(dr)
        assert_equal(sfs, [
            {
                "a": "hello",
                "b": "there"
            },
            {
                "a": "one",
                "b": "two",
                "c": "three"
            },
            {},
            {
                "a": "alfa",
                "b": "bravo"
            },
        ])
        dr.close()
Example No. 14
def test_creation():
    s = fields.Schema(content=fields.TEXT(phrase=True),
                      title=fields.TEXT(stored=True),
                      path=fields.ID(stored=True),
                      tags=fields.KEYWORD(stored=True),
                      quick=fields.NGRAM,
                      note=fields.STORED)
    st = RamStorage()

    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(title=u("First"), content=u("This is the first document"),
                   path=u("/a"), tags=u("first second third"),
                   quick=u("First document"),
                   note=u("This is the first document"))
    w.add_document(content=u("Let's try this again"), title=u("Second"),
                   path=u("/b"), tags=u("Uno Dos Tres"),
                   quick=u("Second document"),
                   note=u("This is the second document"))
    w.commit()
Example No. 15
class TweetSchema(fields.SchemaClass):
    id = fields.ID(stored=True, unique=True)
    url = fields.ID(stored=True, unique=True)

    text = fields.TEXT(stored=True)
    source = fields.TEXT(stored=True)

    reply = fields.BOOLEAN(stored=True)
    in_reply_to_id = fields.TEXT(stored=True)
    in_reply_to_name = fields.TEXT(stored=True)

    user_mentions = fields.KEYWORD(stored=True)
    hashtags = fields.KEYWORD(stored=True)
    urls = fields.KEYWORD(stored=True)

    geo = fields.BOOLEAN(stored=True)
    latitude = fields.NUMERIC(stored=True)
    longitude = fields.NUMERIC(stored=True)

    date = fields.DATETIME(stored=True)
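
A hedged sketch of querying an index built with TweetSchema; the directory name and the query string are assumptions for illustration:

from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir("tweets_index")  # hypothetical existing index
with ix.searcher() as s:
    # BOOLEAN fields parse true/false terms, so geotagged replies
    # can be filtered directly in the query string.
    q = QueryParser("text", ix.schema).parse("whoosh reply:true geo:true")
    for hit in s.search(q, limit=10):
        print(hit["date"], hit["text"])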
Example No. 16
    def test_creation1(self):
        s = fields.Schema()
        s.add("content", fields.TEXT(phrase=True))
        s.add("title", fields.TEXT(stored=True))
        s.add("path", fields.ID(stored=True))
        s.add("tags", fields.KEYWORD(stored=True))
        s.add("quick", fields.NGRAM)
        s.add("note", fields.STORED)

        self.assertEqual(s.field_names(),
                         ["content", "title", "path", "tags", "quick", "note"])
        self.assert_("content" in s)
        self.assertFalse("buzz" in s)
        self.assert_(isinstance(s["tags"], fields.KEYWORD))
        self.assert_(isinstance(s[3], fields.KEYWORD))
        self.assert_(s[0] is s.field_by_number(0))
        self.assert_(s["title"] is s.field_by_name("title"))
        self.assert_(s.name_to_number("path") == 2)
        self.assert_(s.number_to_name(4) == "quick")
        self.assertEqual(s.scorable_fields(), [0, 1, 4])
Example No. 17
def test_fuzzy_prefix():
    from whoosh import scoring

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(spelling=True))

    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        # Match -> first
        w.add_document(title=u("First"),
                       content=u("This is the first document we've added!"))
        # No match
        w.add_document(
            title=u("Second"),
            content=u("The second one is even more interesting! filst"))
        # Match -> first
        w.add_document(title=u("Third"),
                       content=u("The world first line we've added!"))
        # Match -> zeroth
        w.add_document(
            title=u("Fourth"),
            content=u("The second one is alaways comes after zeroth!"))
        # Match -> fire is within 2 edits (transpose + delete) of first
        w.add_document(title=u("Fifth"), content=u("The fire is beautiful"))

    from whoosh.qparser import QueryParser, FuzzyTermPlugin
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    q = parser.parse("first~2/3 OR zeroth", debug=False)

    assert isinstance(q, query.Or)
    ft = q[0]
    assert isinstance(ft, query.FuzzyTerm)
    assert ft.maxdist == 2
    assert ft.prefixlength == 3

    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        results = searcher.search(q)
        assert len(results) == 4
        assert (" ".join(sorted(
            hit["title"] for hit in results)) == "Fifth First Fourth Third")
Example No. 18
def test_multi_language():
    # Analyzer for English
    ana_eng = analysis.StemmingAnalyzer()

    # analyzer for Pig Latin
    def stem_piglatin(w):
        if w.endswith("ay"):
            w = w[:-2]
        return w

    ana_pig = analysis.StemmingAnalyzer(stoplist=["nday", "roay"],
                                        stemfn=stem_piglatin)

    # Dictionary mapping languages to analyzers
    analyzers = {"eng": ana_eng, "pig": ana_pig}

    # Fake documents
    corpus = [(u("eng"), u("Such stuff as dreams are made on")),
              (u("pig"), u("Otay ebay, roay otnay otay ebay"))]

    schema = fields.Schema(content=fields.TEXT(stored=True),
                           lang=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        for doclang, content in corpus:
            ana = analyzers[doclang]
            # "Pre-analyze" the field into token strings
            words = [token.text for token in ana(content)]
            # Note we store the original value but index the pre-analyzed words
            w.add_document(lang=doclang,
                           content=words,
                           _stored_content=content)

    with ix.searcher() as s:
        schema = s.schema

        # Modify the schema to fake the correct analyzer for the language
        # we're searching in
        schema["content"].analyzer = analyzers["eng"]

        qp = qparser.QueryParser("content", schema)
        q = qp.parse("dreaming")
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["content"] == "Such stuff as dreams are made on"

        schema["content"].analyzer = analyzers["pig"]
        qp = qparser.QueryParser("content", schema)
        q = qp.parse("otnay")
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["content"] == "Otay ebay, roay otnay otay ebay"
Example No. 19
def test_term_inspection():
    schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(
                title=u("My document"),
                content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE"))
            w.add_document(title=u("My other document"),
                           content=u("AA AB BB CC EE EE AX AX DD"))

        with ix.reader() as r:
            cterms = " ".join(r.field_terms("content"))
            assert cterms == "aa ab ax bb cc dd ee"

            a_exp = list(r.expand_prefix("content", "a"))
            assert a_exp == [b('aa'), b('ab'), b('ax')]

            assert set(r.all_terms()) == set([('content', b('aa')),
                                              ('content', b('ab')),
                                              ('content', b('ax')),
                                              ('content', b('bb')),
                                              ('content', b('cc')),
                                              ('content', b('dd')),
                                              ('content', b('ee')),
                                              ('title', b('document')),
                                              ('title', b('my')),
                                              ('title', b('other'))])

            # (text, doc_freq, index_freq)
            cstats = _fstats(r.iter_field("content"))
            assert cstats == [
                (b('aa'), 2, 6), (b('ab'), 1, 1), (b('ax'), 1, 2),
                (b('bb'), 2, 5), (b('cc'), 2, 3), (b('dd'), 2, 2),
                (b('ee'), 2, 4)
            ]

            prestats = _fstats(r.iter_field("content", prefix="c"))
            assert prestats == [(b('cc'), 2, 3), (b('dd'), 2, 2),
                                (b('ee'), 2, 4)]

            assert list(r.most_frequent_terms("content")) == [(6, b('aa')),
                                                              (5, b('bb')),
                                                              (4, b('ee')),
                                                              (3, b('cc')),
                                                              (2, b('dd'))]
            assert list(r.most_frequent_terms("content",
                                              prefix="a")) == [(6, b('aa')),
                                                               (2, b('ax')),
                                                               (1, b('ab'))]
            assert list(r.most_distinctive_terms("content", 3)) == [
                (1.3862943611198906, b('ax')), (0.6931471805599453, b('ab')),
                (0.0, b('ee'))
            ]
Example No. 20
def test_multitoken_phrase():
    textfield = fields.TEXT()
    textfield.multitoken_query = "phrase"
    schema = fields.Schema(text=textfield)
    parser = default.QueryParser("text", schema)
    qstring = u("chaw-bacon")

    texts = list(schema["text"].process_text(qstring))
    assert texts == ["chaw", "bacon"]

    q = parser.parse(qstring)
    assert q.__class__ == query.Phrase
Example No. 21
def test_bypass_stemming():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("rendering shading modeling reactions"))
    w.commit()

    with ix.reader() as r:
        assert_equal(list(r.lexicon("text")),
                     ["model", "reaction", "render", "shade"])
        assert_equal(list(r.word_graph("text").flatten_strings()),
                     ["modeling", "reactions", "rendering", "shading"])
Example No. 22
    def build_index(self):
        schema = fields.Schema(keyword=fields.TEXT(stored=True),
                               content=fields.TEXT(stored=True))

        if not os.path.exists(config.path_index):
            os.mkdir(config.path_index)
        else:
            shutil.rmtree(config.path_index)
            os.mkdir(config.path_index)

        index.create_in(config.path_index, schema)
        ix = index.open_dir(config.path_index)

        writer = ix.writer()

        with open(config.path_dirty_talk, 'r') as fr:
            for line in fr:
                for word in jieba_tool.cut(line):
                    print(word)
                    writer.add_document(keyword=word, content=line.strip())

        writer.commit()
Example No. 23
def test_docwriter_one():
    field = fields.TEXT(stored=True)
    st, codec, seg = _make_codec()
    dw = codec.per_document_writer(st, seg)
    dw.start_doc(0)
    dw.add_field("text", field, "Testing one two three", 4)
    dw.finish_doc()
    dw.close()
    seg.set_doc_count(1)

    pdr = codec.per_document_reader(st, seg)
    assert pdr.doc_field_length(0, "text") == 4
    assert pdr.stored_fields(0) == {"text": "Testing one two three"}
Example No. 24
def test_groupby_phrase():
    domain = {"Alan Ball": "Tel Aviv", "Alan Charles": "San Francisco",
              "Alan Darwin": "London", "Alan Eames": "Paris"}

    schema = fields.Schema(name=fields.TEXT(stored=True),
                           city=fields.TEXT(stored=True),
                           city_g=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for name, city in domain.items():
            w.add_document(name=u(name), city=u(city), city_g=u(city))

    with ix.searcher() as s:
        q = query.Term("name", "alan")
        r = s.search(q, groupedby="city_g")
        keys = sorted(r.groups().keys())
        assert keys == ["London", "Paris", "San Francisco", "Tel Aviv"]

        sff = sorting.StoredFieldFacet("city")
        r = s.search(q, groupedby=sff)
        keys = sorted(r.groups().keys())
        assert keys == ["London", "Paris", "San Francisco", "Tel Aviv"]
Example No. 25
    def _mk_schema(self, dsinfo):
        from whoosh import fields as wf
        from whoosh.analysis import StandardAnalyzer

        # TODO support some customizable mapping to homogenize some metadata fields
        # onto a given set of index keys
        self.schema = wf.Schema(id=wf.ID,
                                path=wf.ID(stored=True),
                                type=wf.ID(stored=True),
                                parentds=wf.ID(stored=True),
                                meta=wf.TEXT(
                                    stored=False,
                                    analyzer=StandardAnalyzer(minsize=2)))
Example No. 26
def test_score_retrieval():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True))
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(title=u("Miss Mary"),
                        content=u("Mary had a little white lamb its fleece"
                                  " was white as snow"))
    writer.add_document(title=u("Snow White"),
                        content=u("Snow white lived in the forest with seven"
                                  " dwarfs"))
    writer.commit()

    with ix.searcher() as s:
        results = s.search(query.Term("content", "white"))
        assert len(results) == 2
        assert results[0]['title'] == u("Miss Mary")
        assert results[1]['title'] == u("Snow White")
        assert results.score(0) is not None
        assert results.score(0) != 0
        assert results.score(0) != 1
Example No. 27
    def test_missing_field_scoring(self):
        schema = fields.Schema(name=fields.TEXT(stored=True),
                               hobbies=fields.TEXT(stored=True))
        storage = store.RamStorage()
        idx = index.Index(storage, schema, create=True)
        writer = idx.writer()
        writer.add_document(name=u'Frank', hobbies=u'baseball, basketball')
        writer.commit()
        self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
        self.assertEqual(idx.segments[0].field_length(1), 1)  # name

        writer = idx.writer()
        writer.add_document(name=u'Jonny')
        writer.commit()
        self.assertEqual(len(idx.segments), 1)
        self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
        self.assertEqual(idx.segments[0].field_length(1), 2)  # name

        parser = qparser.MultifieldParser(['name', 'hobbies'], schema=schema)
        searcher = idx.searcher()
        result = searcher.search(parser.parse(u'baseball'))
        self.assertEqual(len(result), 1)
Example No. 28
def test_spellable_list():
    # Make sure a spellable field works with a list of pre-analyzed tokens

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(Location=fields.STORED, Lang=fields.STORED,
                           Title=fields.TEXT(spelling=True, analyzer=ana))
    ix = RamStorage().create_index(schema)

    doc = {'Location': '1000/123', 'Lang': 'E',
           'Title': ['Introduction', 'Numerical', 'Analysis']}

    with ix.writer() as w:
        w.add_document(**doc)
Example No. 29
def _create_data_index():
    schema = whoosh_fields.Schema(
        url=whoosh_fields.ID(stored=True, unique=True),
        type=whoosh_fields.STORED(),
        title=whoosh_fields.STORED(),
        description=whoosh_fields.STORED(),
        org=whoosh_fields.STORED(),
        subtype=whoosh_fields.STORED(),
        content=whoosh_fields.TEXT(),
    )
    _ensure_dir(DATA_DIR)
    assert not whoosh_index.exists_in(DATA_DIR), DATA_DIR
    return whoosh_index.create_in(DATA_DIR, schema)
Example No. 30
def test_nonexistant_fieldnames():
    # Need an analyzer that won't mangle a URL
    a = analysis.SimpleAnalyzer("\\S+")
    schema = fields.Schema(id=fields.ID, text=fields.TEXT(analyzer=a))

    qp = default.QueryParser("text", schema)
    q = qp.parse(u("id:/code http://localhost/"))
    assert q.__class__ == query.And
    assert q[0].__class__ == query.Term
    assert q[0].fieldname == "id"
    assert q[0].text == "/code"
    assert q[1].__class__ == query.Term
    assert q[1].fieldname == "text"
    assert q[1].text == "http://localhost/"