Example #1
  def test_build_attrs(self):
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    self.assertEquals(adapter.indexable, False)
    self.assertEquals(adapter.doc_attrs, {})

    adapter = SAAdapter(Entity, schema)
    self.assertEquals(adapter.indexable, False)

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    self.assertEquals(adapter.indexable, True)
    self.assertEquals(set(adapter.doc_attrs),
                      set(('object_key', 'id', 'name', 'object_type',
                           'text', 'created_at', 'updated_at', 'name_prefix',
                           'owner', 'owner_name', 'creator_name', 'creator')))
    self.assertTrue(all(callable(f) for f in adapter.doc_attrs.itervalues()))

    self.assertEquals(set(schema.names()),
                      set(('object_key', 'id', 'object_type', 'name',
                           'text', 'created_at', 'updated_at', 'name_prefix',
                           'owner', 'owner_name', 'creator_name', 'creator')))

    schema = Schema(
      id=NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=True),
    )
    adapter = SAAdapter(Indexable, schema)
    self.assertEquals(adapter.indexable, True)
    self.assertEquals(set(adapter.doc_attrs),
                      set(('id', 'text', 'num', 'name')))
    self.assertTrue(all(callable(f) for f in adapter.doc_attrs.itervalues()))

    self.assertEquals(set(schema.names()),
                      set(('id', 'text', 'num', 'name')))
    self.assertTrue(isinstance(schema['text'], TEXT))
    self.assertTrue(isinstance(schema['num'], NUMERIC))
Example #2
def test_build_attrs_3():
    schema = Schema()
    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        "allowed_roles_and_users",
        "created_at",
        "creator",
        "creator_name",
        "id",
        "name",
        "name_prefix",
        "object_key",
        "object_type",
        "owner",
        "owner_name",
        "slug",
        "tag_ids",
        "tag_text",
        "text",
        "updated_at",
    }
    assert all(callable(f) for f in adapter.doc_attrs.values())

    assert set(schema.names()) == {
        "allowed_roles_and_users",
        "created_at",
        "creator",
        "creator_name",
        "id",
        "name",
        "name_prefix",
        "object_key",
        "object_type",
        "owner",
        "owner_name",
        "slug",
        "tag_ids",
        "tag_text",
        "text",
        "updated_at",
    }
Example #3
    def __init__(self):

        chfilter = CharsetFilter(accent_map)
        stoplist = stoplists["en"].union(stoplists["fr"])
        analyzer = RegexTokenizer() | LowercaseFilter() | \
            StopFilter(stoplist=stoplist) | chfilter

        # defines the schema
        # see http://pythonhosted.org/Whoosh/schema.html for reference
        keywordType = KEYWORD(lowercase=True, scorable=True)
        self.schema = Schema(content=TEXT(analyzer=analyzer),
                             docType=TEXT,
                             docId=ID(stored=True, unique=True),
                             tags=keywordType)

        # Adds dynamic fields so each documents can index its fields in the
        # same Whoosh index
        self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
        self.schema.add('*_date', DATETIME, glob=True)
        self.schema.add('*_number', NUMERIC, glob=True)
        self.schema.add('*_boolean', BOOLEAN, glob=True)

        # Creates the index folder and Whoosh index files if it doesn't exist
        # And loads the index in any case
        if not os.path.exists("indexes"):
            os.mkdir("indexes")
            self.index = index.create_in("indexes", self.schema)
        else:
            self.index = index.open_dir("indexes")

        # Creates the doctypes folder if it doesn't exist
        if not os.path.exists("doctypes"):
            os.mkdir("doctypes")

        # Creates the doctypes default schema file if it doesn't exist
        if not os.path.exists('doctypes/doctypes_schema.json'):
            with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
                defaultFile.write("{}")

        '''
        Loads the doctypes schema if it's valid, otherwise recreates it
        Doctypes schema is a dictionary of doctypes with their fields created
        and updated when a document is indexed.
        That way, we can tell Whoosh which fields to search by default, because
        there is apparently no way to say "search in all fields".
        '''
        with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
            try:
                self.doctypesSchema = json.load(rawJSON)
            except ValueError:
                rawJSON.write("{}")
                self.doctypesSchema = {}
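
The docstring above notes that Whoosh has no built-in "search all fields" mode, which is why the doctypes schema records each doctype's fields. As a hedged sketch (not part of the original class; the method name and the exact layout of doctypesSchema are assumptions), those recorded fields could be fed to MultifieldParser like this:

    def search_doctype(self, doc_type, query_string, limit=50):
        # hypothetical helper, assuming doctypesSchema maps a doctype to its field names
        from whoosh.qparser import MultifieldParser
        fields = self.doctypesSchema.get(doc_type, [])
        if not fields:
            return []
        parser = MultifieldParser(fields, schema=self.index.schema)
        query = parser.parse(query_string)
        with self.index.searcher() as searcher:
            # docId is stored=True in the schema above, so it can be returned
            return [hit["docId"] for hit in searcher.search(query, limit=limit)]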
Example #4
  def test_build_attrs(self):
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    assert not adapter.indexable
    assert adapter.doc_attrs == {}

    adapter = SAAdapter(Entity, schema)
    assert adapter.indexable == False

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
      'object_key', 'id', 'name', 'slug', 'object_type',
      'text', 'created_at', 'updated_at', 'name_prefix',
      'owner', 'owner_name', 'creator_name', 'creator',
      'allowed_roles_and_users', 'tag_ids', 'tag_text',
    }
    assert all(callable(f) for f in adapter.doc_attrs.itervalues())

    assert set(schema.names()) == {
      'object_key', 'id', 'object_type', 'name', 'slug',
      'text', 'created_at', 'updated_at', 'name_prefix',
      'owner', 'owner_name', 'creator_name', 'creator',
      'allowed_roles_and_users', 'tag_ids', 'tag_text',
    }

    schema = Schema(
      id=NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=True),
    )
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {'id', 'text', 'num', 'name'}
    assert all(callable(f) for f in adapter.doc_attrs.itervalues())

    assert set(schema.names()) == {'id', 'text', 'num', 'name'}
    assert isinstance(schema['text'], TEXT)
    assert isinstance(schema['num'], NUMERIC)
Example #5
def test_build_attrs_4():
    schema = Schema(id=NUMERIC(bits=64, signed=False, stored=True, unique=True))
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        "id",
        "text",
        "num",
        "name",
        "object_type",
        "object_key",
    }
    assert all(callable(f) for f in adapter.doc_attrs.values())

    assert set(schema.names()) == {
        "id",
        "text",
        "num",
        "name",
        "object_type",
        "object_key",
    }
    assert isinstance(schema["text"], TEXT)
    assert isinstance(schema["num"], NUMERIC)
Example #6
File: store.py Project: leifj/pyFF
 def __init__(self):
     self.schema = Schema(scopes=KEYWORD(),
                          descr=TEXT(),
                          service_name=TEXT(),
                          service_descr=TEXT(),
                          keywords=KEYWORD())
     self.schema.add("object_id", ID(stored=True, unique=True))
     self.schema.add("entity_id", ID(stored=True, unique=True))
     for a in list(ATTRS.keys()):
         self.schema.add(a, KEYWORD())
     self._collections = set()
     from whoosh.filedb.filestore import RamStorage, FileStorage
     self.storage = RamStorage()
     self.storage.create()
     self.index = self.storage.create_index(self.schema)
     self.objects = dict()
     self.infos = dict()
Example #7
from galaxy.eggs import require
from galaxy.web.framework.helpers import to_unicode
# Whoosh is compatible with Python 2.5+. Try to import Whoosh and set a flag to indicate whether tool search is enabled.
try:
    require("Whoosh")

    from whoosh.filedb.filestore import RamStorage
    from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
    from whoosh.index import Index
    from whoosh.scoring import BM25F
    from whoosh.qparser import MultifieldParser
    tool_search_enabled = True
    schema = Schema(id=STORED, title=TEXT, description=TEXT, help=TEXT)
except ImportError, e:
    tool_search_enabled = False
    schema = None


class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses
    the "whoosh" search library.
    """
    def __init__(self, toolbox):
        """
        Create a searcher for `toolbox`. 
        """
        self.toolbox = toolbox
        self.enabled = tool_search_enabled
        if tool_search_enabled:
            self.build_index()
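
The constructor calls self.build_index(), which is not shown in this excerpt. A minimal sketch of what that step could look like with the RamStorage and schema imported above, assuming (not taken from the snippet) that the toolbox exposes a tools_by_id mapping and that each tool has .name, .description and .help attributes:

    def build_index(self):
        # build an in-memory Whoosh index over the tools in this toolbox
        # (tools_by_id and the tool attributes below are assumptions about Galaxy)
        self.storage = RamStorage()
        self.index = self.storage.create_index(schema)
        writer = self.index.writer()
        for tool_id, tool in self.toolbox.tools_by_id.items():
            writer.add_document(id=tool_id,
                                title=to_unicode(tool.name),
                                description=to_unicode(tool.description),
                                help=to_unicode(tool.help))
        writer.commit()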
Example #8
# CUSTOM ANALYZER wordsplit + lowercase filter, for pathname-like text
#
# This is useful to:
# - avoid removing "stop words" from text
# - search case-insensitively
#
PATHANALYZER = RegexTokenizer() | LowercaseFilter()

# INDEX SCHEMA DEFINITION
SCHEMA = Schema(
    fileid=ID(unique=True),
    owner=TEXT(analyzer=EMAILADDRANALYZER),
    # this field preserves case of repository name for exact matching
    repository_rawname=TEXT(analyzer=IDANALYZER),
    repository=TEXT(stored=True, analyzer=ICASEIDANALYZER),
    path=TEXT(stored=True, analyzer=PATHANALYZER),
    content=FieldType(format=Characters(),
                      analyzer=ANALYZER,
                      scorable=True,
                      stored=True),
    modtime=STORED(),
    extension=TEXT(stored=True, analyzer=PATHANALYZER))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
Example #9
def make_index():
    schema = Schema(url=ID(stored=True), tags=KEYWORD)
    if not os.path.exists("index"):
        os.mkdir("index")
        return create_in("index", schema)
    # the directory already exists: reopen the existing index instead of
    # falling through and returning None (assumes open_dir is imported
    # from whoosh.index alongside create_in)
    return open_dir("index")
Example #10
def get_schema():
    return Schema(path=ID(unique=True, stored=True), content=TEXT)
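
A short, hedged usage sketch for this schema (the directory name, path and content below are invented for illustration):

import os
from whoosh.index import create_in
from whoosh.qparser import QueryParser

def demo_search():
    # create an index from get_schema(), add one document, then query it
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = create_in("indexdir", get_schema())
    writer = ix.writer()
    writer.add_document(path=u"/docs/readme.txt", content=u"whoosh quick start example")
    writer.commit()
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(u"whoosh")
        for hit in searcher.search(query):
            print(hit["path"])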
Example #11
from whoosh.qparser import MultifieldParser
from whoosh.query import And, Every, Term

from galaxy import exceptions
from galaxy.exceptions import ObjectNotFound
from galaxy.util.search import parse_filters

log = logging.getLogger(__name__)

schema = Schema(
    id=NUMERIC(stored=True),
    name=TEXT(field_boost=1.7, stored=True),
    description=TEXT(field_boost=1.5, stored=True),
    long_description=TEXT(stored=True),
    homepage_url=TEXT(stored=True),
    remote_repository_url=TEXT(stored=True),
    repo_owner_username=TEXT(stored=True),
    categories=KEYWORD(stored=True, commas=True, scorable=True),
    times_downloaded=STORED,
    approved=STORED,
    last_updated=STORED,
    repo_lineage=STORED,
    full_last_updated=STORED)


class RepoWeighting(scoring.BM25F):
    """
    Affect the BM25F scoring model through the final method.
    source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ
    """
    use_final = True
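
With use_final = True, Whoosh calls the weighting model's final() hook on every raw score. A hedged sketch of such a hook; the boost formula below is an assumption for illustration, not the Tool Shed's actual weighting:

    def final(self, searcher, docnum, score):
        # read the stored (non-scorable) popularity counter for this document
        stored = searcher.stored_fields(docnum)
        times_downloaded = int(stored.get("times_downloaded") or 0)
        # assumed formula: gently reward popular repositories without
        # overwhelming plain text relevance
        popularity = times_downloaded / (times_downloaded + 100.0)
        return score * (1.0 + popularity)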
Example #12
    def searchTags(self, tags=[]):
        """ Search tags in merged pdf file

        :param tags: List of search tags e.g. ['Introduction', 'Experiment']
        :type tags: list
        """
        # Update tags
        if not tags:
            tags = self.props['tags']

        # Create custom FuzzyTerm for fuzzy tag search
        class CustomFuzzyTerm(FuzzyTerm):
            def __init__(self,
                         fieldname,
                         text,
                         boost=1.0,
                         maxdist=self.props['maxdist'],
                         prefixlength=self.props['prefixlength'],
                         constantscore=True):
                super(CustomFuzzyTerm,
                      self).__init__(fieldname, text, boost, maxdist,
                                     prefixlength, constantscore)

        # Create temporary directory tmpdir
        if not os.path.exists("tmpdir"):
            os.mkdir("tmpdir")

        schema = Schema(title=TEXT(stored=True),
                        path=ID(stored=True),
                        content=TEXT(stored=True))
        ix = index.create_in("tmpdir", schema)
        writer = ix.writer()
        for i, ltObj in enumerate(self.ltObjList):
            #writer.add_document(title=str(i), content=str(ltObj['OBJ'].get_text()), path=u"/a")
            #st=str(ltObj['OBJ'].get_text())
            st = ltObj['OBJ'].get_text().encode('utf-8')
            st = st.decode('utf-8')
            sti = str(i)
            writer.add_document(title=sti, content=st)
        writer.commit()

        results = []
        id = 0
        for tag in tags:
            print('Searching tag: ', tag)
            with ix.searcher() as searcher:
                #query = QueryParser("content", ix.schema).parse(tag)

                qp = QueryParser("content",
                                 schema=ix.schema,
                                 termclass=CustomFuzzyTerm)
                query = qp.parse(tag + '~4/4')

                res = searcher.search(query, limit=None)
                for hit in res:
                    results.append(
                        dict({
                            'ID':
                            id,
                            'DOCNAME':
                            self.ltObjList[hit.docnum]['DOCNAME'],
                            'TAG':
                            tag,
                            'PAGE':
                            self.ltObjList[hit.docnum]['PAGE'],
                            'HITNUM':
                            hit.docnum,
                            'BBOX':
                            self.ltObjList[hit.docnum]['OBJ'].bbox,
                            'TEXT':
                            self.ltObjList[hit.docnum]['OBJ'].get_text()
                        }))
                    id += 1
        self.results = results

        return results
Example #13
import unicodecsv as csv
from whoosh import index, sorting
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import Schema, STORED, NGRAMWORDS, NUMERIC
from whoosh.qparser import MultifieldParser

_schema = Schema(
    ror=STORED(),
    grid=STORED(),
    name=NGRAMWORDS(stored=False),
    aliases=NGRAMWORDS(stored=False),
    num_students=NUMERIC(int, sortable=True, stored=False),
    citation_score=NUMERIC(int, sortable=True, stored=False),
)

_index_path = 'data/ror-whoosh-index'


def _read_ror_csv_rows():
    rows = []
    with open('data/ror-metrics.csv') as ror_csv:
        reader = csv.DictReader(ror_csv)
        for row in reader:
            row['aliases'] = row['aliases'].split(
                u'###') if row['aliases'] else []
            row['num_students'] = int(
                row['num_students']) if row['num_students'] else None
            row['citation_score'] = float(
                row['citation_score']) if row['citation_score'] else None
            rows.append(row)
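
The snippet stops after reading the CSV. As a hedged continuation sketch (not from the original file; it assumes _read_ror_csv_rows() returns the rows list it builds, that the CSV has ror and grid columns, and that os is available), the rows could be written into an index built from _schema and searched with the imported MultifieldParser, sorted by citation_score:

import os

def _build_index():
    if not os.path.exists(_index_path):
        os.makedirs(_index_path)
    ix = index.create_in(_index_path, _schema)
    writer = ix.writer()
    for row in _read_ror_csv_rows():
        doc = {
            'ror': row['ror'],
            'grid': row.get('grid'),
            'name': row['name'],
            'aliases': u' '.join(row['aliases']),
            'num_students': row['num_students'],
            'citation_score': int(row['citation_score']) if row['citation_score'] else None,
        }
        # drop missing values so optional columns do not break indexing
        writer.add_document(**{k: v for k, v in doc.items() if v is not None})
    writer.commit()
    return ix

def _search(ix, text, limit=10):
    # rank name/alias matches, best citation_score first
    parser = MultifieldParser(["name", "aliases"], schema=ix.schema)
    by_citations = sorting.FieldFacet("citation_score", reverse=True)
    with ix.searcher() as searcher:
        hits = searcher.search(parser.parse(text), sortedby=by_citations, limit=limit)
        return [hit["ror"] for hit in hits]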
Example #14
    def _build_doc_attrs(self, model_class: Type[Model],
                         schema: Schema) -> None:
        mapper = sa.inspect(model_class)

        args = self.doc_attrs
        # Any field not in schema will be stored here.
        # After all field have been discovered, we add the missing ones.
        field_definitions = {}

        def setup_field(
                attr_name: str, field_name: Union[Tuple[str, Union[type, ID]],
                                                  str]) -> None:
            field_def = False
            if not isinstance(field_name, str):
                field_name, field_def = field_name

            if field_name not in schema:
                if (field_name not in field_definitions
                        or field_definitions[field_name] is False):
                    field_definitions[field_name] = field_def

            # attrgetter offers dotted name support. Useful for attributes on
            # related objects.
            args.setdefault(field_name, {})[attr_name] = attrgetter(attr_name)

        # model level definitions
        for name, field_names in self.index_to:
            if isinstance(field_names, str):
                field_names = (field_names, )
            for field_name in field_names:
                setup_field(name, field_name)

        # per column definitions
        for col in mapper.columns:
            name = col.name
            info = col.info

            if not info.get("searchable"):
                continue

            index_to = info.get("index_to", (name, ))
            if isinstance(index_to, str):
                index_to = (index_to, )

            for field_name in index_to:
                setup_field(name, field_name)

        # add missing fields to schema
        for field_name, field_def in field_definitions.items():
            if field_name in schema:
                continue

            if field_def is False:
                field_def = TEXT(stored=True, analyzer=accent_folder)

            logger.debug(
                "Adding field to schema:\n"
                "  Model: %s\n"
                '  Field: "%s" %s',
                model_class._object_type(),
                field_name,
                field_def,
            )
            schema.add(field_name, field_def)
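
For context, a hedged illustration of the column-level hints this loop consumes (the model and column below are invented, not from this codebase): a column is picked up when its SQLAlchemy info dict contains "searchable", and "index_to" can redirect it to one or more schema fields, optionally bundling a field definition.

# Invented example model showing the info hints read by the loop above:
class Contact(Model):
    __tablename__ = "contact"

    id = sa.Column(sa.Integer, primary_key=True)
    email = sa.Column(
        sa.UnicodeText(),
        info={
            "searchable": True,
            # index the same column under two schema fields; the bare "text"
            # entry falls back to the default TEXT(stored=True, analyzer=accent_folder)
            "index_to": (("email", ID(stored=True)), "text"),
        },
    )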
Example #15
from whoosh.fields import Schema, TEXT, ID
import index_helper

coaches_index_dir = "coaches_index"

coaches_schema = Schema(
    Name=TEXT,
    CoachAPIID=ID(stored=True),
    TeamID=TEXT,
    WinLoss=TEXT,
    DOB=TEXT,
    Recognitions=TEXT,
    PastTeams=TEXT,
    PlayersCoached=TEXT,
)

coaches_attributes = [
    "Name", "CoachAPIID", "TeamID", "WinLoss", "DOB", "Recognitions",
    "PastTeams", "PlayersCoached"
]

coaches_id_name = "CoachAPIID"


def search_coach_index(query):
    return index_helper.search_index(coaches_index_dir, coaches_schema,
                                     coaches_attributes, coaches_id_name,
                                     query)


if __name__ == '__main__':
Example #16
from whoosh import index, qparser
from whoosh.fields import Schema, NUMERIC, KEYWORD
import json

from paths import here_path, merged_dir_path, top_dir
from movies import movies, WhichMovie, name_dict, movie_dict
from load_files import yarn_file_paths, parsed_scripts_file_paths, fandom_links_file_path

character_links = json.load(fandom_links_file_path.open('r', encoding='UTF-8'))
index_dir_path = here_path / "indexdir"

if not index_dir_path.exists():
	index_dir_path.mkdir()

schema = Schema(
	movie=NUMERIC(stored=True),
	character=NUMERIC(stored=True),
	quote=KEYWORD(stored=True),
)

ix = index.create_in(index_dir_path, schema)
writer = ix.writer()

print("Building index")

for script_file in parsed_scripts_file_paths:
	print(f'Building index for file "{script_file.relative_to(top_dir)}"')
	movie: WhichMovie = name_dict[script_file.stem]

	script_data = json.load(script_file.open('r', encoding="UTF-8"))
	print(f"Indexing ({movie}): ", end="")
	len_all_quotes = len(script_data['quotes'])
Example #17
import lib.DatabaseLayer as db

argParser = argparse.ArgumentParser(description='Fulltext indexer for the MongoDB CVE collection')
argParser.add_argument('-v', action='store_true', default=False, help='Verbose logging')
argParser.add_argument('-l', default=5, help='Number of last entries to index (Default: 5) - 0 to index all documents')
argParser.add_argument('-n', action='store_true', default=False, help='lookup complete cpe (Common Platform Enumeration) name for vulnerable configuration to add in the index')
args = argParser.parse_args()

c = cves.last(namelookup=args.n)

indexpath = Configuration.getIndexdir()

from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID

schema = Schema(title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT)

if not os.path.exists(indexpath):
    os.mkdir(indexpath)

if not exists_in(indexpath):
    ix = create_in(indexpath, schema)
else:
    ix = open_dir(indexpath)
def dumpallcveid(entry=None):
    return db.getCVEID if not entry else db.getCVEIDs(int(entry))


def getcve(cveid=None):
    if cveid is None:
        return False
Example #18
    def build_index(self):
        """Build a `Whoosh <https://whoosh.readthedocs.io/en/latest/index.html>`_
        index for product types searches.

        .. versionadded:: 1.0
        """
        index_dir = os.path.join(self.conf_dir, ".index")

        # use eodag_version to help keeping index up-to-date
        eodag_version = self.get_version()

        create_index = not exists_in(index_dir)
        # check index version
        if not create_index:
            if self._product_types_index is None:
                logger.debug("Opening product types index in %s", index_dir)
                self._product_types_index = open_dir(index_dir)
            try:
                self.guess_product_type(eodagVersion=eodag_version)
            except NoMatchingProductType:
                create_index = True
            finally:
                if create_index:
                    shutil.rmtree(index_dir)
                    logger.debug(
                        "Out-of-date product types index removed from %s",
                        index_dir)

        if create_index:
            logger.debug("Creating product types index in %s", index_dir)
            makedirs(index_dir)
            product_types_schema = Schema(
                ID=fields.STORED,
                abstract=fields.TEXT,
                instrument=fields.IDLIST,
                platform=fields.ID,
                platformSerialIdentifier=fields.IDLIST,
                processingLevel=fields.ID,
                sensorType=fields.ID,
                eodagVersion=fields.ID,
                license=fields.ID,
                title=fields.ID,
                missionStartDate=fields.ID,
                missionEndDate=fields.ID,
            )
            non_indexable_fields = ["bands"]
            self._product_types_index = create_in(index_dir,
                                                  product_types_schema)
            ix_writer = self._product_types_index.writer()
            for product_type in self.list_product_types():
                versioned_product_type = dict(
                    product_type, **{"eodagVersion": eodag_version})
                # add to index
                ix_writer.add_document(
                    **{
                        k: v
                        for k, v in versioned_product_type.items()
                        if k not in non_indexable_fields
                    })
            ix_writer.commit()
        else:
            if self._product_types_index is None:
                logger.debug("Opening product types index in %s", index_dir)
                self._product_types_index = open_dir(index_dir)
Example #19
#!/usr/bin/python3
# coding: utf-8
# Whoosh workflow: 1. create a schema  2. build the index  3. query the index
# https://my.oschina.net/u/2351685/blog/603079
# https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py
from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh import qparser
# Make sure the ./tmp/ directory exists before running, otherwise an error is raised
##################################################################
## 0. A short working version first; the explanation follows
from whoosh.index import create_in
from whoosh.fields import TEXT, ID, Schema  # importing just these is enough
schema = Schema(title=TEXT(stored=True), content=TEXT)
ix = create_in('./tmp', schema)               # store the schema info under ./tmp/; ** run this only once, otherwise a LockError is raised **
writer = ix.writer()                          # add the documents to be indexed, following the schema definition
writer.add_document(title='hello', content='hello world')
writer.add_document(title='world', content='world hello')
writer.commit()                               # searcher() has to come after commit()
searcher = ix.searcher()                      # create a searcher; better written as `with ix.searcher() as searcher:`, the plain form is used here for brevity
## First way to query:
print(searcher.find('content', 'hello world').fields(0))  # {'title': 'hello'}; TEXT stores position info, so phrase search is supported
print(searcher.find('content', 'hello world')[1].fields())  # {'title': 'world'}
## Another way: Construct query objects directly
from whoosh.query import *
myquery = And([Term("content", "hello"), Term("content", "world")])
print(searcher.search(myquery).fields(1))  # {'title': 'world'}; same result as above
## Third way: Parse a query string; usually the best approach
from whoosh.qparser import QueryParser
Example #20
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example #21
    def store_revision(self, meta, data, overwrite=False,
                       trusted=False, # True for loading a serialized representation or other trusted sources
                       name=None, # TODO name we decoded from URL path
                       action=u'SAVE',
                       remote_addr=None,
                       userid=None,
                       wikiname=None,
                       contenttype_current=None,
                       contenttype_guessed=None,
                       acl_parent=None,
                       ):
        """
        Store a revision into the backend, write metadata and data to it.

        Usually this will be a new revision, either of an existing item or
        a new item. With overwrite mode, we can also store over existing
        revisions.

        :type meta: dict
        :type data: open file (file must be closed by caller)
        :param overwrite: if True, allow overwriting of existing revs.
        :returns: a Revision instance of the just created revision
        """
        if remote_addr is None:
            try:
                # if we get here outside a request, this won't work:
                remote_addr = unicode(request.remote_addr)
            except:
                pass
        if userid is None:
            try:
                # if we get here outside a request, this won't work:
                userid = flaskg.user.valid and flaskg.user.itemid or None
            except:
                pass
        if wikiname is None:
            wikiname = app.cfg.interwikiname
        state = {'trusted': trusted,
                 keys.NAME: name,
                 keys.ACTION: action,
                 keys.ADDRESS: remote_addr,
                 keys.USERID: userid,
                 keys.WIKINAME: wikiname,
                 keys.ITEMID: self.itemid, # real itemid or None
                 'contenttype_current': contenttype_current,
                 'contenttype_guessed': contenttype_guessed,
                 'acl_parent': acl_parent,
                }
        ct = meta.get(keys.CONTENTTYPE)
        if ct == CONTENTTYPE_USER:
            Schema = UserMetaSchema
        else:
            Schema = ContentMetaSchema
        m = Schema(meta)
        valid = m.validate(state)
        # TODO: currently we just log validation results. in the end we should
        # reject invalid stuff in some comfortable way.
        if not valid:
            logging.warning("metadata validation failed, see below")
            for e in m.children:
                logging.warning("{0}, {1}".format(e.valid, e))

        # we do not have anything in m that is not defined in the schema,
        # e.g. userdefined meta keys or stuff we do not validate. thus, we
        # just update the meta dict with the validated stuff:
        meta.update(dict(m.value.items()))
        # we do not want None / empty values:
        meta = dict([(k, v) for k, v in meta.items() if v not in [None, []]])

        if self.itemid is None:
            self.itemid = meta[ITEMID]
        backend = self.backend
        if not overwrite:
            revid = meta.get(REVID)
            if revid is not None and revid in backend:
                raise ValueError('need overwrite=True to overwrite existing revisions')
        meta, data, content = self.preprocess(meta, data)
        data.seek(0)  # rewind file
        revid = backend.store(meta, data)
        meta[REVID] = revid
        self.indexer.index_revision(meta, content)
        if not overwrite:
            self._current = self.indexer._document(revid=revid)
        return Revision(self, revid)
Example #22
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in, open_dir
from whoosh.query import *
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

# a first learning example with whoosh + jieba

cnAnalyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True, analyzer=cnAnalyzer),
                content=TEXT(stored=False, analyzer=cnAnalyzer),
                path=ID(stored=True))

import os.path
if not os.path.exists('sample_index'):
    os.mkdir('sample_index')
ix = create_in('sample_index', schema)
ix = open_dir('sample_index')

writer = ix.writer()
writer.add_document(title=u'爱吃大板的博客', content=u'大家好!这里是爱吃大板的博客,欢迎光临!大板是一种雪糕。')
writer.add_document(title=u'蝴蝶定理吃雪糕',
                    content=u'好阿婆雪糕蝴蝶定理最爱吃了!必须买下来。It\'s tasty!')
writer.commit()
writer = ix.writer()
writer.add_document(title=u"My document", content=u"This is my document!")
writer.add_document(title=u"Second try",
                    content=u"This is the second example.",
                    path='http://sdu.edu.cn/')
writer.add_document(title=u"Third time's the charm",
                    content=u"Examples are many.",
Example #23
__author__ = 'rich'

import datetime
import os
import pandas as pd
from whoosh.fields import Schema
from whoosh.fields import TEXT, ID, DATETIME, KEYWORD
from whoosh.index import open_dir
from whoosh.index import create_in
from whoosh.query import Term, And, Or
from whoosh.qparser import QueryParser

my_schema = Schema(id=ID(unique=True, stored=True),
                   lang=TEXT(),
                   screenname=TEXT(),
                   tweettext=TEXT(),
                   hashtags=TEXT(),
                   datetime=DATETIME())

if not os.path.exists("tweets_index"):
    os.mkdir("tweets_index")
    index = create_in("tweets_index", my_schema)
index = open_dir("tweets_index")
writer = index.writer()

df = pd.read_csv('tweets/tweets.csv',
                 header=None,
                 names=[
                     'id', 'language', 'screenname', 'tweettext', 'hashtags',
                     'timestamp'
                 ])
Example #24
import os

from whoosh import index
from whoosh.fields import Schema, ID, TEXT, NGRAM

# directory where the index data is stored
INDEX_DIR = "indexdir"

# define the schema for the index
schema = Schema(
    # use the post URL as the unique ID of each index entry
    post_url=ID(unique=True, stored=True),
    # index the body text as N-grams
    body=NGRAM(stored=True),
)

def get_or_create_index():
    # create the index directory if it does not exist yet
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)
        # create the index files
        ix = index.create_in(INDEX_DIR, schema)
        return ix

    # the index directory already exists,
    # so open and reuse the existing index files
    ix = index.open_dir(INDEX_DIR)
    return ix
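
A hedged usage sketch for get_or_create_index() (the URL and body text are invented); because body is an NGRAM field, partial-word queries also match:

from whoosh.qparser import QueryParser

def add_and_search_example():
    ix = get_or_create_index()
    writer = ix.writer()
    writer.add_document(post_url=u"https://example.com/post/1",
                        body=u"whoosh ngram indexing example")
    writer.commit()
    with ix.searcher() as searcher:
        # the NGRAM field matches inside words, so "gram" finds "ngram"
        query = QueryParser("body", ix.schema).parse(u"gram")
        return [hit["post_url"] for hit in searcher.search(query)]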
Example #25
"""
import ConfigParser
import os
import sys

sys.path.insert(
    1,
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, 'lib')))

# Whoosh is compatible with Python 2.5+. Try to import Whoosh and set a flag to indicate whether search is enabled.
try:
    from whoosh.filedb.filestore import FileStorage
    from whoosh.fields import Schema, STORED, TEXT
    whoosh_search_enabled = True
    schema = Schema(id=STORED, name=TEXT, info=TEXT, dbkey=TEXT, message=TEXT)
    import galaxy.model.mapping
    from galaxy import config, model
except ImportError, e:
    whoosh_search_enabled = False
    schema = None


def build_index(sa_session, whoosh_index_dir):
    storage = FileStorage(whoosh_index_dir)
    index = storage.create_index(schema)
    writer = index.writer()

    def to_unicode(a_basestr):
        if type(a_basestr) is str:
            return unicode(a_basestr, 'utf-8')
Example #26
import os
import whoosh
import codecs
from whoosh.fields import Schema
from whoosh.index import create_in
from mappers import inputdatastream
from whoosh.fields import ID, KEYWORD, TEXT

FILEPATH = "/Users/denisvrdoljak/Berkeley/W205/Asn4_Work/ WC2015-2testing.csv"


#schema setup
my_schema = Schema(id = ID(unique=True, stored=True), 
                    path = ID(stored=True), 
                    tagsearch = ID(stored=True),
                    tags = TEXT(stored=True), 
                    date = TEXT(stored=True),
                    hour = TEXT(stored=True),
                    tweet = TEXT(stored=True))


#enter data
# create the index directory first, then the index, then the writer
if not os.path.exists("wwc-index1"):
    os.mkdir("wwc-index1")
index = create_in("wwc-index1", my_schema)
writer = index.writer()

for i,line in enumerate(inputdatastream(FILEPATH)):
    print ".",
    writer.add_document( path = FILEPATH.encode("utf-8"),
                    tagsearch = line.split(",")[4].encode("utf-8"),
                    tags = [word for word in line.split(",")[0] if '#' in word], 
Example #27
from whoosh.fields import Schema, TEXT, NUMERIC, STORED
from whoosh.support.charset import accent_map
from whoosh.analysis import StemmingAnalyzer, StandardAnalyzer, CharsetFilter
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from geoalchemy2 import Geometry
from shapely import wkb
from json import loads
from shutil import rmtree

from ambiente import geocode_db, whoosh_base

db = SQLAlchemy()
analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
schema = Schema(id=NUMERIC,
                nome=TEXT(analyzer=analyzer, stored=True),
                geom=STORED)


class Poligono(db.Model):
    __tablename__ = 'openLS_localizacaopoligono'
    __searchable__ = ['nome']
    id = db.Column(db.Integer, primary_key=True)
    nome = db.Column(db.Unicode)
    geom = db.Column(Geometry('POLYGON'))


class Linha(db.Model):
    __tablename__ = 'openLS_localizacaolinha'
    __searchable__ = ['nome']
    id = db.Column(db.Integer, primary_key=True)
Example #28
import re, os, codecs
import progressbar
from whoosh.fields import Schema, TEXT
from whoosh.index import create_in


def visible(element):
    if element.parent.name in [
            'style', 'script', '[document]', 'head', 'title'
    ]:
        return False
    elif re.match('<!--.*-->', element.encode('utf-8')):
        return False
    return True


dir = os.listdir('dataset')
schema = Schema(content=TEXT(stored=True))
ix = create_in("database", schema)

with progressbar.ProgressBar(maxval=21890,
                             widgets=[
                                 ' [',
                                 progressbar.Timer(), '][',
                                 progressbar.ETA(), '][',
                                 progressbar.Percentage(), ']',
                                 progressbar.Bar('=', '[', '] '),
                                 progressbar.Counter()
                             ]) as bar:

    for i, l in enumerate(dir):
        bar.update(i + 1)
        # print l
Example #29
File: Index.py Project: elfdown/ee208
import os.path
from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
from whoosh.index import create_in
from jieba.analyse import ChineseAnalyzer

if __name__ == "__main__":
    schema_doc = Schema(title=TEXT(stored=True, sortable=True),
                        content=TEXT(stored=True,
                                     sortable=True,
                                     analyzer=ChineseAnalyzer()),
                        url=ID(stored=True))

    if not os.path.exists("index_doc"):
        os.mkdir("index_doc")

    create_in("index_doc", schema_doc)

    schema_img = Schema(title=TEXT(stored=True, sortable=True),
                        content=TEXT(stored=True,
                                     sortable=True,
                                     analyzer=ChineseAnalyzer()),
                        src=ID(stored=True),
                        source=ID(stored=True))

    if not os.path.exists("index_img"):
        os.mkdir("index_img")

    create_in("index_img", schema_img)
Example #30
# coding:utf-8
# A previous project used whoosh, but at the time I didn't have the patience to look into it, so here is a more thorough walkthrough.
# whoosh is a fast search library, well suited to something like a blog: it lets you retrieve the content you want quickly.
# First install whoosh: pip2 install whoosh
# quick_start
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = create_in("indexdir", schema)  # create an index object backed by the indexdir directory
writer = ix.writer()  # a writer object
writer.add_document(
    title=u'First document',
    path=u"/a",
    content=u"This is the first document we've added!")  # add a document: its content, path and title
writer.add_document(title=u'Second document',
                    path=u"/b",
                    content=u"The second one is even more interesting! "
                    )  # add another document
writer.commit()  # commit the added content
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(
        'first')  # matches content containing "first", a bit like re.compile
    results = searcher.search(query)  # this is where the query actually runs
    print(results[0])
# result:   {"title": u"First document", "path": u"/a"}
# The code above will not run as-is (it raises an error), but it shows the basic usage.

# All of this is inferred from existing experience; for real use, consult the documentation.
Example #31
# for fetching
from bs4 import BeautifulSoup
import urllib.request
# for creating the schema
import os, os.path
import whoosh.index as index
from whoosh.fields import Schema, TEXT, ID
# for searching
from whoosh.qparser import QueryParser

# creating the schema and indexing
schema = Schema(title=ID(stored=True), content=TEXT(stored=True))

if not os.path.exists("index"):
    os.mkdir("index")

ix = index.create_in("index", schema)
ix = index.open_dir("index")

writer = ix.writer()

lis_link = list()
lis_link.append("http://pythonforbeginners.com")
lis_link.append("http://www.python.org")
lis_link.append("https://docs.microsoft.com/en-us/dotnet/csharp/")
lis_link.append("https://www.tutorialspoint.com/cplusplus/index.htm")
for i in range(len(lis_link)):
    html_page = urllib.request.urlopen(lis_link[i])
    soup = BeautifulSoup(html_page, 'html.parser')
    writer.add_document(title=lis_link[i], content=soup.prettify())
    print("success add document " + str(i + 1))
Example #32
class WhooshEngine(BaseEngine):
    # whoosh
    schema = Schema(title=TEXT(stored=True),
                    path=TEXT(stored=True),
                    href=ID(stored=True),
                    cfiBase=TEXT(stored=True),
                    spinePos=TEXT(stored=True),
                    content=TEXT)

    def open(self):
        try:
            self.ix = index.open_dir(self.database_path)
        except Exception as e:
            logger.error("openning database {} failed".format(
                self.database_name))

    def create(self):

        if not os.path.exists(self.database_path):
            os.mkdir(self.database_path)

        try:
            logger.debug("openning database {} to create".format(
                self.database_name))
            self.ix = index.create_in(self.database_path, self.schema)
        except Exception as e:
            logger.error(e)

        self.writer = self.ix.writer()

    def add(self, path='', href='', title='', cfiBase='', spinePos=''):
        text = self.__get_text(path)
        self.writer.add_document(title=str(title),
                                 path=str(path),
                                 href=str(href),
                                 cfiBase=str(cfiBase),
                                 spinePos=str(spinePos),
                                 content=str(text))
        logger.debug("Indexed: " + title + ' | ' + path + ' | ' + href +
                     ' | ' + str(spinePos))

    def finished(self):
        self.writer.commit()

    def query(self, q, limit=None):
        logger.debug('Q {}'.format(q))
        with self.ix.searcher() as searcher:
            results = []
            parsed_query = QueryParser("content",
                                       schema=self.ix.schema).parse(q)
            hits = searcher.search(parsed_query, limit=limit)
            logger.debug("Hits {}".format(hits))
            for hit in hits:
                item = {}
                item['title'] = hit["title"].encode("utf-8")
                item['href'] = hit["href"].encode("utf-8")
                item['path'] = hit["path"].encode("utf-8")
                item['cfiBase'] = hit["cfiBase"].encode("utf-8")
                item['spinePos'] = hit["spinePos"].encode("utf-8")
                results.append(item)

            return results

    def __get_text(self, filename):
        # html = urllib.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html').read()
        html = open(filename, "r")
        soup = BeautifulSoup(html, "lxml")
        texts = soup.findAll(text=True)

        def visible(element):
            if element.parent.name in [
                    'style', 'script', '[document]', 'head', 'title'
            ]:
                return False
            elif re.match('<!--.*-->', str(element.encode('utf-8'))):
                return False
            return True

        visible_texts = filter(visible, texts)

        contents = ' '.join([s for s in visible_texts])

        return contents.strip()  #.encode('utf-8')
Example #33
def init_extensions(app):
    # initialize by calling PyMongo's init_app method
    mongo.init_app(app)

    # register the app with the login manager by calling init_app
    # this mainly assigns login_manager to the app.login_manager attribute
    # so the app can use its login/logout features
    login_manager.init_app(app)

    # # caching speeds up the web app: a classic space-for-time trade-off
    if app.config.get('USE_CACHE', False):
        cache.init_app(app, {})

    # read the upload configuration and store it on the app
    configure_uploads(app, upload_photos)

    # mail configuration
    mail.init_app(app)

    # search service configuration
    whoosh_searcher.init_app(app)
    # use jieba for Chinese word segmentation
    chinese_analyzer = ChineseAnalyzer()
    # build the index schema object
    post_schema = Schema(obj_id=ID(unique=True, stored=True),
                         title=TEXT(stored=True, analyzer=chinese_analyzer),
                         content=TEXT(stored=True, analyzer=chinese_analyzer),
                         create_at=DATETIME(stored=True),
                         topic_id=ID(stored=True),
                         user_id=ID(stored=True))
    whoosh_searcher.add_index('posts', post_schema)

    # admin interface configuration
    admin.init_app(app)
    with app.app_context():
        admin.add_view(admin_view.OptionsModelView(mongo.db['options'],
                                                   '系统设置'))
        admin.add_view(admin_view.UsersModelView(mongo.db['users'], '用户管理'))
        admin.add_view(
            admin_view.TopicsModelView(mongo.db['topics'],
                                       '话题管理',
                                       category='内容管理'))
        admin.add_view(
            admin_view.PostsModelView(mongo.db['posts'],
                                      '问答管理',
                                      category='内容管理'))
        # admin.add_view(admin_view.IndexModelView(mongo.db['index_article'],
        #         '主页文章管理', category='内容管理'))
        admin.add_view(
            admin_view.PassagewaysModelView(mongo.db['passageways'],
                                            '温馨通道',
                                            category='推广管理'))
        admin.add_view(
            admin_view.FriendLinksModelView(mongo.db['friend_links'],
                                            '友链管理',
                                            category='推广管理'))
        admin.add_view(
            admin_view.PagesModelView(mongo.db['pages'],
                                      '页面管理',
                                      category='推广管理'))
        admin.add_view(
            admin_view.FooterLinksModelView(mongo.db['footer_links'],
                                            '底部链接',
                                            category='推广管理'))
        admin.add_view(
            admin_view.AdsModelView(mongo.db['ads'], '广告管理', category='推广管理'))
Example #34
import jieba
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
from whoosh.index import create_in
from whoosh.index import open_dir
from jieba.analyse import ChineseAnalyzer
# from pyltp import Segmentor
analyzer = ChineseAnalyzer()
import os.path
# import pyltp
filename_list = []
ID_list = []
schema = Schema(title=TEXT, content=TEXT)

schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer),
    content=TEXT(stored=True, analyzer=analyzer),
    ID=ID(stored=True),
    tags=KEYWORD,
    icon=STORED,
)

path_generator = os.walk("./blog_engine")
for path, d, filelist in path_generator:
    for filename in d:

        ix = open_dir("/home/iiip/桌面/blog_engine/" + filename)
Example #35
def get_schema():
    return Schema(title=TEXT(stored=True), date_start=DATETIME(stored=True), date_end=DATETIME(stored=True),
                  description=TEXT(stored=True), categoria=TEXT(stored=True))
Example #36
# -*- coding: utf-8 -*-

#http://blog.csdn.net/twsxtd/article/details/8308893

I recently wanted to build a search engine, so naturally I had a look at the famous Lucene. It really is excellent, but its Python binding, pylucene, is disappointing: it is not a pure Python implementation
but a wrapper that still runs Java underneath, so it depends on a JDK and the installation is extremely tedious. Lucene has plenty of usable Chinese word-segmentation dictionaries, but because of this glue layer many of them cannot be used,
so in the end I gave up on it; to be fair, a pure Java implementation would have been excellent. There are also sphinx, and coreseek built on top of it specifically for Chinese, but they appear to be SQL-based,
which has little to do with what I want; and there is the C++ framework xapian, which gets very good reviews for both speed and accuracy. In the end, though, I settled on Whoosh, implemented in pure Python:
it is very simple to use from Python, just a module you can easy_install. There is very little material about it in Chinese, however, so I will translate its documentation, starting today.

Quick Start
    Whoosh is a library for indexing and searching text. It can provide text search for your programs; for example, if you are writing blog software, you can use Whoosh to add a search feature so users can search blog entries.
Here is a short example:
from whoosh.index import create_in
from whoosh.fields import *
schema = Schema(title = TEXT(stored = True),path = ID(stored=True),content=TEXT)
ix = create_in("/home/gswewf/百科/indexer",schema)  # (translator's note: "indexer" here is actually a directory, so following these steps literally will fail; you have to create the directory first)
writer = ix.writer()
writer.add_document(title=u"First document",path=u"/a",
                    content = u"this is the first document we've add!")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"The second one is even more interesting!")
writer.commit()
from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    results[0]
{"title": u"First document", "path": u"/a"}

Index and Schema objects
Example #37
    def __init__(self, index_dir, backend, wiki_name=None, acl_rights_contents=[], **kw):
        """
        Store params, create schemas.
        """
        self.index_dir = index_dir
        self.index_dir_tmp = index_dir + '.temp'
        self.backend = backend
        self.wikiname = wiki_name
        self.ix = {}  # open indexes
        self.schemas = {}  # existing schemas

        common_fields = {
            # wikiname so we can have a shared index in a wiki farm, always check this!
            WIKINAME: ID(stored=True),
            # tokenized NAME from metadata - use this for manual searching from UI
            NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
            # unmodified NAME from metadata - use this for precise lookup by the code.
            # also needed for wildcard search, so the original string as well as the query
            # (with the wildcard) is not cut into pieces.
            NAME_EXACT: ID(field_boost=3.0),
            # revision id (aka meta id)
            REVID: ID(unique=True, stored=True),
            # parent revision id
            PARENTID: ID(stored=True),
            # MTIME from revision metadata (converted to UTC datetime)
            MTIME: DATETIME(stored=True),
            # tokenized CONTENTTYPE from metadata
            CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
            # unmodified list of TAGS from metadata
            TAGS: ID(stored=True),
            LANGUAGE: ID(stored=True),
            # USERID from metadata
            USERID: ID(stored=True),
            # ADDRESS from metadata
            ADDRESS: ID(stored=True),
            # HOSTNAME from metadata
            HOSTNAME: ID(stored=True),
            # SIZE from metadata
            SIZE: NUMERIC(stored=True),
            # ACTION from metadata
            ACTION: ID(stored=True),
            # tokenized COMMENT from metadata
            COMMENT: TEXT(stored=True),
            # SUMMARY from metadata
            SUMMARY: TEXT(stored=True),
            # data (content), converted to text/plain and tokenized
            CONTENT: TEXT(stored=True),
        }

        latest_revs_fields = {
            # ITEMID from metadata - as there is only latest rev of same item here, it is unique
            ITEMID: ID(unique=True, stored=True),
            # unmodified list of ITEMLINKS from metadata
            ITEMLINKS: ID(stored=True),
            # unmodified list of ITEMTRANSCLUSIONS from metadata
            ITEMTRANSCLUSIONS: ID(stored=True),
            # tokenized ACL from metadata
            ACL: TEXT(analyzer=AclTokenizer(acl_rights_contents), multitoken_query="and", stored=True),
        }
        latest_revs_fields.update(**common_fields)

        userprofile_fields = {
            # Note: email / openid (if given) should be unique, but we might
            # have lots of empty values if it is not given and thus it is NOT
            # unique overall! Wrongly declaring it unique would lead to whoosh
            # killing other users from index when update_document() is called!
            EMAIL: ID(stored=True),
            OPENID: ID(stored=True),
        }
        latest_revs_fields.update(**userprofile_fields)

        all_revs_fields = {
            ITEMID: ID(stored=True),
        }
        all_revs_fields.update(**common_fields)

        latest_revisions_schema = Schema(**latest_revs_fields)
        all_revisions_schema = Schema(**all_revs_fields)

        # Define dynamic fields
        dynamic_fields = [("*_id", ID(stored=True)),
                          ("*_text", TEXT(stored=True)),
                          ("*_keyword", KEYWORD(stored=True)),
                          ("*_numeric", NUMERIC(stored=True)),
                          ("*_datetime", DATETIME(stored=True)),
                          ("*_boolean", BOOLEAN(stored=True)),
                         ]

        # Adding dynamic fields to schemas
        for glob, field_type in dynamic_fields:
            latest_revisions_schema.add(glob, field_type, glob=True)
            all_revisions_schema.add(glob, field_type, glob=True)

        # schemas are needed by query parser and for index creation
        self.schemas[ALL_REVS] = all_revisions_schema
        self.schemas[LATEST_REVS] = latest_revisions_schema

        # what fields could whoosh result documents have (no matter whether all revs index
        # or latest revs index):
        self.common_fields = set(latest_revs_fields.keys()) & set(all_revs_fields.keys())
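
The dynamic fields registered above mean that any extra metadata key whose name matches one of the globs is indexed with the matching field type. A hedged illustration (the key name and values are invented; it assumes an index ix opened with one of these schemas):

# e.g. a user-defined key "severity_keyword" matches the "*_keyword" glob and
# is indexed/stored as KEYWORD:
#
#   with ix.writer() as writer:
#       writer.add_document(wikiname=u"wiki", revid=u"0011aabb",
#                           itemid=u"deadbeef", severity_keyword=u"high")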
Example #38
    def store_revision(self, meta, data, overwrite=False,
                       trusted=False,  # True for loading a serialized representation or other trusted sources
                       name=None,  # TODO name we decoded from URL path
                       action=ACTION_SAVE,
                       remote_addr=None,
                       userid=None,
                       wikiname=None,
                       contenttype_current=None,
                       contenttype_guessed=None,
                       acl_parent=None,
                       return_rev=False,
                       fqname=None,
                       ):
        """
        Store a revision into the backend, write metadata and data to it.

        Usually this will be a new revision, either of an existing item or
        a new item. With overwrite mode, we can also store over existing
        revisions.

        :type meta: dict
        :type data: open file (file must be closed by caller)
        :param overwrite: if True, allow overwriting of existing revs.
        :param return_rev: if True, return a Revision instance of the just created revision
        :returns: a Revision instance or None
        """
        if remote_addr is None:
            try:
                # if we get here outside a request, this won't work:
                remote_addr = unicode(request.remote_addr)
            except:
                pass
        if userid is None:
            try:
                # if we get here outside a request, this won't work:
                userid = flaskg.user.valid and flaskg.user.itemid or None
            except:
                pass
        if wikiname is None:
            wikiname = app.cfg.interwikiname
        state = {'trusted': trusted,
                 NAME: [name],
                 ACTION: action,
                 ADDRESS: remote_addr,
                 USERID: userid,
                 WIKINAME: wikiname,
                 NAMESPACE: None,
                 ITEMID: self.itemid,  # real itemid or None
                 'contenttype_current': contenttype_current,
                 'contenttype_guessed': contenttype_guessed,
                 'acl_parent': acl_parent,
                 FQNAME: fqname,
                }
        ct = meta.get(CONTENTTYPE)
        if ct == CONTENTTYPE_USER:
            Schema = UserMetaSchema
        else:
            Schema = ContentMetaSchema
        m = Schema(meta)
        valid = m.validate(state)
        if not valid:
            logging.warning("metadata validation failed, see below")
            for e in m.children:
                logging.warning("{0}, {1}".format(e.valid, e))
            logging.warning("data validation skipped as we have no valid metadata")
            if VALIDATION_HANDLING == VALIDATION_HANDLING_STRICT:
                raise ValueError('metadata validation failed and strict handling requested, see the log for details')

        # we do not have anything in m that is not defined in the schema,
        # e.g. userdefined meta keys or stuff we do not validate. thus, we
        # just update the meta dict with the validated stuff:
        meta.update(dict(m.value.items()))
        # we do not want None / empty values:
        # XXX do not kick out empty lists before fixing NAME processing:
        meta = dict([(k, v) for k, v in meta.items() if v not in [None, ]])

        if valid and not validate_data(meta, data):  # need valid metadata to validate data
            logging.warning("data validation failed")
            if VALIDATION_HANDLING == VALIDATION_HANDLING_STRICT:
                raise ValueError('data validation failed and strict handling requested, see the log for details')

        if self.itemid is None:
            self.itemid = meta[ITEMID]
        backend = self.backend
        if not overwrite:
            revid = meta.get(REVID)
            if revid is not None and revid in backend:
                raise ValueError('need overwrite=True to overwrite existing revisions')
        meta, data, content = self.preprocess(meta, data)
        data.seek(0)  # rewind file
        backend_name, revid = backend.store(meta, data)
        meta[REVID] = revid
        self.indexer.index_revision(meta, content, backend_name)
        if not overwrite:
            self._current = self.indexer._document(revid=revid)
        if return_rev:
            return Revision(self, revid)
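A hedged sketch of a typical call to store_revision (the item instance, metadata keys, and wiki text below are illustrative; per the docstring, the caller closes the data file):

from io import BytesIO

# Illustrative only: store a new revision of an existing item.
meta = {CONTENTTYPE: u'text/x.moin.wiki;charset=utf-8'}  # minimal metadata; validation fills in the rest
data = BytesIO(b"= Hello =\nSome wiki text.\n")
rev = item.store_revision(meta, data, return_rev=True)   # 'item' is assumed to be an existing Item instance
data.close()  # data file must be closed by the caller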
Example #39
def get_schema():
    return Schema(remitente=TEXT(stored=True),
                  destinatarios=KEYWORD(stored=True),
                  asunto=TEXT(stored=True),
                  contenido=TEXT(stored=True))
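A minimal usage sketch for this schema (assuming the whoosh package is available; the directory name and document values are illustrative):

import os
from whoosh.index import create_in
from whoosh.qparser import QueryParser

os.makedirs("mail_index", exist_ok=True)
ix = create_in("mail_index", get_schema())

# Add one sample message and search its body.
with ix.writer() as writer:
    writer.add_document(remitente=u"alice@example.org",
                        destinatarios=u"bob@example.org carol@example.org",
                        asunto=u"Status report",
                        contenido=u"The weekly status report is attached.")

with ix.searcher() as searcher:
    query = QueryParser("contenido", ix.schema).parse(u"status")
    for hit in searcher.search(query):
        print(hit["asunto"], hit["remitente"])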
Example #40
File: store.py  Project: leifj/pyFF
class WhooshStore(SAMLStoreBase):

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]

            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)

        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]

        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return b2u(self._collections)

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(list(self.objects.keys()))
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return b2u(list(self.objects.values()))
            else:
                return b2u(list(self.infos.values()))

        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return b2u(list(lst))
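The query rewriting inside lookup converts pyFF-style attribute=value selectors into Whoosh query syntax before parsing. A standalone sketch of just that transformation (the ATTRS_INV mapping here is a made-up stand-in for the real one in pyFF):

import re

# Hypothetical stand-in for pyFF's ATTRS_INV (attribute URI -> short field name).
ATTRS_INV = {"https://example.org/attrs/entity-category": "entity_category"}

def rewrite_key(key):
    """Rewrite a lookup key the same way WhooshStore.lookup does before parsing it."""
    key = key.strip('+')
    key = key.replace('+', ' AND ')
    for uri, a in ATTRS_INV.items():
        key = key.replace(uri, a)
    key = " {!s} ".format(key)
    key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)   # attr=value  ->  attr:value
    key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)  # {attr}value ->  attr:value
    return key.strip()

print(rewrite_key("https://example.org/attrs/entity-category=hide-from-discovery"))
# entity_category:hide-from-discovery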
Example #41
def init_indexes_and_parsers():
    path = app.config['SEARCH_INDEX_PATH']
    # Initialize the documentations index
    name = 'doc'
    if exists_in(path, indexname=name):
        indexes['doc'] = open_dir(path, indexname=name)
    else:
        try:
            os.makedirs(path)
        except OSError:
            pass
        schema = Schema(
                        id=ID(stored=True, unique=True),
                        )
        schema.add(
                'title_*',
                TEXT(field_boost=2.0, analyzer=domotego_analyzer),
                glob=True
                )
        schema.add('text_*', TEXT(analyzer=domotego_analyzer), glob=True)
        indexes['doc'] = create_in(path, schema, indexname=name)
        index_docs(Page.objects(pagetype='doc'))
    # Initialize the categories index
    name = 'category'
    if exists_in(path, indexname=name):
        indexes['category'] = open_dir(path, indexname=name)
    else:
        try:
            os.makedirs(path)
        except OSError:
            pass
        schema = Schema(
                        id=ID(stored=True, unique=True),
                        )
        schema.add(
                'name_*',
                TEXT(field_boost=2.0, analyzer=domotego_analyzer),
                glob=True
                )
        schema.add(
                'description_*',
                TEXT(analyzer=domotego_analyzer),
                glob=True
                )
        indexes['category'] = create_in(path, schema, indexname=name)
        index_categories(Category.objects)
    # Initialize the products index
    name = 'product'
    if exists_in(path, indexname=name):
        indexes['product'] = open_dir(path, indexname=name)
    else:
        try:
            os.makedirs(path)
        except OSError:
            pass
        schema = Schema(
                        id=ID(stored=True, unique=True),
                        reference=KEYWORD,
                        keywords=KEYWORD(lowercase=True, field_boost=1.5)
                        )
        schema.add(
                'name_*',
                TEXT(field_boost=2.0, analyzer=domotego_analyzer),
                glob=True
                )
        schema.add(
                'description_*',
                TEXT(analyzer=domotego_analyzer),
                glob=True
                )
        indexes['product'] = create_in(path, schema, indexname=name)
        index_products(BaseProduct.objects)

    # Initialize the parsers
    docparserfields = []
    categoryparserfields = []
    productparserfields = ['reference', 'keywords']
    for lg in app.config['LANGS']:
        docparserfields.append('title_'+lg)
        docparserfields.append('text_'+lg)
        categoryparserfields.append('name_'+lg)
        categoryparserfields.append('description_'+lg)
        productparserfields.append('name_'+lg)
        productparserfields.append('description_'+lg)
    parsers['doc'] = qparser.MultifieldParser(
                                    docparserfields,
                                    schema=indexes['doc'].schema,
                                    termclass=FuzzierTerm
                                    )
    parsers['category'] = qparser.MultifieldParser(
                                    categoryparserfields,
                                    schema=indexes['category'].schema,
                                    termclass=FuzzierTerm
                                    )
    parsers['product'] = qparser.MultifieldParser(
                                    productparserfields,
                                    schema=indexes['product'].schema,
                                    termclass=FuzzierTerm
                                    )
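A hedged sketch of how these parsers and indexes might then be queried (the query string and the use of the stored id field are illustrative):

def search_products(query_string, limit=20):
    """Parse a free-text query against the product fields and return stored ids."""
    query = parsers['product'].parse(query_string)
    with indexes['product'].searcher() as searcher:
        return [hit['id'] for hit in searcher.search(query, limit=limit)]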
Example #42
    def index_corpus(self):
        """Make a Whoosh index out of a pre-processed corpus, ie TLG, PHI5,
        or PHI7.

        TLG takes almost 13 min; PHI5 1.5 min.
        To set up index parameters:
        >>> # cltk_index = CLTKIndex('latin', 'phi5')  # 1.5 min, 363 docs
        >>> # cltk_index = CLTKIndex('latin', 'phi5', chunk='work')  # 2 min, 837 docs
        >>> # cltk_index = CLTKIndex('greek', 'tlg')  # 13 min, 1823 docs
        >>> # cltk_index = CLTKIndex('greek', 'tlg', chunk='work')  #15.5 min, 6625 docs

        # And to start indexing:
        >>> # cltk_index.index_corpus()

        TODO: Prevent overwriting. Ask user to rm old dir before re-indexing.
        TODO: Add option for lemmatizing.
        TODO: Figure out lower() options.
        TODO: Process TLG through forthcoming normalize().
        TODO: Add name to each index.
        TODO: Turn off any language-specific mods (eg, stemming, case) that
        Whoosh might be doing by default.
        """

        # Setup index dir
        schema = Schema(path=ID(stored=True),
                        author=TEXT(stored=True),
                        content=TEXT)
        try:
            _index = create_in(self.index_path, schema)
        except FileNotFoundError:
            os.makedirs(self.index_path)
            _index = create_in(self.index_path, schema)
        writer = _index.writer()

        # Setup corpus to be indexed
        if self.lang == 'greek' and self.corpus == 'tlg':
            corpus_path = os.path.normpath(get_cltk_data_dir() +
                                           '/greek/text/tlg/plaintext/')
            if self.chunk == 'work':
                corpus_path = os.path.normpath(
                    get_cltk_data_dir() + '/greek/text/tlg/individual_works/')
        elif self.lang == 'latin' and self.corpus == 'phi5':
            corpus_path = os.path.normpath(get_cltk_data_dir() +
                                           '/latin/text/phi5/plaintext/')
            if self.chunk == 'work':
                corpus_path = os.path.normpath(
                    get_cltk_data_dir() + '/latin/text/phi5/individual_works/')
        assert os.path.isdir(corpus_path), 'Corpus does not exist in the following location: "%s". Use CLTK Corpus Importer and TLGU to create transformed corpus.' % corpus_path  # pylint: disable=line-too-long

        files = os.listdir(corpus_path)
        if self.lang == 'greek' and self.corpus == 'tlg':
            files = [f[:-4] for f in files if f.startswith('TLG')]
            corpus_index = TLG_AUTHOR_MAP
        elif self.lang == 'latin' and self.corpus == 'phi5':
            files = [f[:-4] for f in files if f.startswith('LAT')]
            corpus_index = PHI5_AUTHOR_MAP

        time_0 = time.time()
        logger.info("Commencing indexing of %s documents of '%s' corpus." % (len(files), self.corpus))  # pylint: disable=line-too-long
        logger.info('Index will be written to: "%s".' % self.index_path)
        if self.chunk == 'author':
            for count, file in enumerate(files, 1):

                try:
                    if self.lang == 'greek' and self.corpus == 'tlg':
                        file = file[3:]
                        author = corpus_index[file]
                        path = os.path.join(corpus_path, 'TLG' + file + '.TXT')
                    if self.lang == 'latin' and self.corpus == 'phi5':
                        author = corpus_index[file]
                        path = os.path.join(corpus_path, file + '.TXT')
                except KeyError as key_error:
                    if file.startswith('LAT9999'):
                        continue
                    logger.error(key_error)
                    raise

                with open(path) as file_open:
                    content = file_open.read()
                writer.add_document(path=path, author=author, content=content)

                if count % 100 == 0:
                    logger.info('Indexed doc %s.' % count)

        if self.chunk == 'work':
            for count, file in enumerate(files, 1):
                try:
                    if self.lang == 'greek' and self.corpus == 'tlg':
                        path = os.path.join(corpus_path, file + '.TXT')
                        author = corpus_index[file[3:-8]]
                    if self.lang == 'latin' and self.corpus == 'phi5':
                        path = os.path.join(corpus_path, file + '.TXT')
                        author = corpus_index[file[:-8]]
                except KeyError as key_error:
                    if file.startswith('LAT9999'):
                        continue
                    logger.error(key_error)
                    raise

                with open(path) as file_open:
                    content = file_open.read()

                writer.add_document(path=path, author=author, content=content)
                if count % 100 == 0:
                    logger.info('Indexed doc %s.' % count)
        logger.info('Commencing to commit changes.')
        writer.commit()

        time_1 = time.time()
        elapsed = time_1 - time_0
        logger.info('Finished indexing all documents in %s seconds (averaging %s docs per sec.)' % (elapsed, (len(files) / elapsed)))  # pylint: disable=line-too-long
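Once indexing has finished, the corpus can be queried like any other Whoosh index; a minimal sketch (the index path and query term are illustrative):

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def query_corpus(index_path, query_string, limit=10):
    """Open a previously built corpus index and search its 'content' field."""
    ix = open_dir(index_path)
    with ix.searcher() as searcher:
        query = QueryParser('content', ix.schema).parse(query_string)
        return [(hit['author'], hit['path']) for hit in searcher.search(query, limit=limit)]

# e.g. query_corpus(cltk_index.index_path, 'amor')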
Example #43
    def __init__(self, index_storage, backend, wiki_name=None, acl_rights_contents=[], **kw):
        """
        Store params, create schemas.
        """
        self.index_storage = index_storage
        self.backend = backend
        self.wikiname = wiki_name
        self.ix = {}  # open indexes
        self.schemas = {}  # existing schemas

        common_fields = {
            # wikiname so we can have a shared index in a wiki farm, always check this!
            WIKINAME: ID(stored=True),
            # namespace, so we can have different namespaces within a wiki, always check this!
            NAMESPACE: ID(stored=True),
            # tokenized NAME from metadata - use this for manual searching from UI
            NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
            # unmodified NAME from metadata - use this for precise lookup by the code.
            # also needed for wildcard search, so the original string as well as the query
            # (with the wildcard) is not cut into pieces.
            NAME_EXACT: ID(field_boost=3.0),
            # revision id (aka meta id)
            REVID: ID(unique=True, stored=True),
            # parent revision id
            PARENTID: ID(stored=True),
            # backend name (which backend is this rev stored in?)
            BACKENDNAME: ID(stored=True),
            # MTIME from revision metadata (converted to UTC datetime)
            MTIME: DATETIME(stored=True),
            # publish time from metadata (converted to UTC datetime)
            PTIME: DATETIME(stored=True),
            # ITEMTYPE from metadata, always matched exactly hence ID
            ITEMTYPE: ID(stored=True),
            # tokenized CONTENTTYPE from metadata
            CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
            # unmodified list of TAGS from metadata
            TAGS: ID(stored=True),
            LANGUAGE: ID(stored=True),
            # USERID from metadata
            USERID: ID(stored=True),
            # ADDRESS from metadata
            ADDRESS: ID(stored=True),
            # HOSTNAME from metadata
            HOSTNAME: ID(stored=True),
            # SIZE from metadata
            SIZE: NUMERIC(stored=True),
            # ACTION from metadata
            ACTION: ID(stored=True),
            # tokenized COMMENT from metadata
            COMMENT: TEXT(stored=True),
            # SUMMARY from metadata
            SUMMARY: TEXT(stored=True),
            # DATAID from metadata
            DATAID: ID(stored=True),
            # TRASH from metadata
            TRASH: BOOLEAN(stored=True),
            # data (content), converted to text/plain and tokenized
            CONTENT: TEXT(stored=True, spelling=True),
        }

        latest_revs_fields = {
            # ITEMID from metadata - as there is only latest rev of same item here, it is unique
            ITEMID: ID(unique=True, stored=True),
            # unmodified list of ITEMLINKS from metadata
            ITEMLINKS: ID(stored=True),
            # unmodified list of ITEMTRANSCLUSIONS from metadata
            ITEMTRANSCLUSIONS: ID(stored=True),
            # tokenized ACL from metadata
            ACL: TEXT(analyzer=AclTokenizer(acl_rights_contents), multitoken_query="and", stored=True),
            # ngram words, index ngrams of words from main content
            CONTENTNGRAM: NGRAMWORDS(minsize=3, maxsize=6),
        }
        latest_revs_fields.update(**common_fields)

        userprofile_fields = {
            # Note: email / openid (if given) should be unique, but we might
            # have lots of empty values if it is not given and thus it is NOT
            # unique overall! Wrongly declaring it unique would lead to whoosh
            # killing other users from index when update_document() is called!
            EMAIL: ID(stored=True),
            OPENID: ID(stored=True),
            DISABLED: BOOLEAN(stored=True),
            LOCALE: ID(stored=True),
            SUBSCRIPTION_IDS: ID(),
            SUBSCRIPTION_PATTERNS: ID(),
        }
        latest_revs_fields.update(**userprofile_fields)

        # XXX This is a highly adhoc way to support indexing of ticket items.
        ticket_fields = {
            EFFORT: NUMERIC(stored=True),
            DIFFICULTY: NUMERIC(stored=True),
            SEVERITY: NUMERIC(stored=True),
            PRIORITY: NUMERIC(stored=True),
            ASSIGNED_TO: ID(stored=True),
            SUPERSEDED_BY: ID(stored=True),
            DEPENDS_ON: ID(stored=True),
            CLOSED: BOOLEAN(stored=True),
        }
        latest_revs_fields.update(**ticket_fields)

        blog_entry_fields = {
        }
        latest_revs_fields.update(**blog_entry_fields)

        all_revs_fields = {
            ITEMID: ID(stored=True),
        }
        all_revs_fields.update(**common_fields)

        latest_revisions_schema = Schema(**latest_revs_fields)
        all_revisions_schema = Schema(**all_revs_fields)

        # Define dynamic fields
        dynamic_fields = [("*_id", ID(stored=True)),
                          ("*_text", TEXT(stored=True)),
                          ("*_keyword", KEYWORD(stored=True)),
                          ("*_numeric", NUMERIC(stored=True)),
                          ("*_datetime", DATETIME(stored=True)),
                          ("*_boolean", BOOLEAN(stored=True)),
                         ]

        # Adding dynamic fields to schemas
        for glob, field_type in dynamic_fields:
            latest_revisions_schema.add(glob, field_type, glob=True)
            all_revisions_schema.add(glob, field_type, glob=True)

        # schemas are needed by query parser and for index creation
        self.schemas[ALL_REVS] = all_revisions_schema
        self.schemas[LATEST_REVS] = latest_revisions_schema

        # what fields could whoosh result documents have (no matter whether all revs index
        # or latest revs index):
        self.common_fields = set(latest_revs_fields.keys()) & set(all_revs_fields.keys())
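With the dynamic fields registered above, any metadata key whose name matches one of the glob patterns is indexed without further schema changes. A small illustrative sketch of what that looks like at indexing time (the RamStorage index and the field values are made up; in the class itself the schema is kept in self.schemas[LATEST_REVS]):

from whoosh.filedb.filestore import RamStorage

storage = RamStorage()
storage.create()
ix = storage.create_index(latest_revisions_schema)
with ix.writer() as writer:
    # 'ticket_id' matches the '*_id' glob, 'votes_numeric' matches '*_numeric'.
    writer.add_document(ticket_id=u"T-123", votes_numeric=42)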
Example #44
    def __init__(self, tfidf_path, strict=True):
        schema = Schema(docid=ID(stored=True), content=TEXT(stored=True))
        self.ix = open_dir(tfidf_path)
        self.searcher = self.ix.searcher()