Example #1
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True,
                                                                         field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int,
                                                                     field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float,
                                                                     field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored,
                                                                   field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start',
                                                                        stored=field_class.stored,
                                                                        field_boost=field_class.boost)
            else:
                # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
                # Chinese word segmentation: ChineseAnalyzer replaces the default StemmingAnalyzer
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=ChineseAnalyzer(),
                                                                  field_boost=field_class.boost, sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))
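
The only change from the stock Haystack Whoosh backend in this build_schema override is the catch-all TEXT branch, which swaps StemmingAnalyzer for ChineseAnalyzer so document text is tokenized by a Chinese segmenter. A minimal, self-contained sketch of the same idea outside Haystack, assuming jieba is installed and its Whoosh-compatible ChineseAnalyzer is the one in use (an assumption, not something stated in the example):

# Hypothetical standalone sketch: a Whoosh schema with a Chinese-segmented TEXT field.
import os

from whoosh.fields import ID, Schema, TEXT
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer  # assumption: jieba provides the analyzer

schema = Schema(id=ID(stored=True, unique=True),
                text=TEXT(stored=True, analyzer=ChineseAnalyzer()))

os.makedirs("demo_index", exist_ok=True)
ix = create_in("demo_index", schema)
with ix.writer() as writer:  # the writer commits on leaving the with-block
    writer.add_document(id="1", text="我爱自然语言处理")  # "I love natural language processing"

with ix.searcher() as searcher:
    query = QueryParser("text", ix.schema).parse("自然语言")
    for hit in searcher.search(query):
        print(hit["id"], hit["text"])
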
Example #2
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True, commas=True, scorable=True)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, type=int)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, type=float)
            elif field_class.field_type == 'boolean':
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True, analyzer=StemmingAnalyzer())

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
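
This variant appears to target an older Haystack/Whoosh pairing: the catch-all TEXT field keeps the default StemmingAnalyzer, and NUMERIC is given a type= keyword. In current Whoosh releases the equivalent keyword is numtype, as Examples #1 and #5 use. A minimal sketch of the modern spelling, assuming a recent Whoosh:

# Quick check of the keyword difference (assumes a recent Whoosh release):
from whoosh.fields import NUMERIC

count_field = NUMERIC(stored=True, numtype=int)    # current Whoosh spelling
price_field = NUMERIC(stored=True, numtype=float)  # float fields use the same keyword
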
Example #3
    def __init__(self, th):
        BaseSearchEngine.__init__(self, th, False)
        self.__ix_writer = None
        ix_dir = os.path.join(th.prefixes[0], 'index',
                              "ix_" + str(whoosh_ix_ver))
        if not os.path.isdir(ix_dir):
            os.makedirs(ix_dir)
        # try to load a pre-existing index
        try:
            self.indexer = open_dir(ix_dir)
        except (EmptyIndexError, IndexVersionError):
            # create a new one
            try:
                shutil.rmtree(ix_dir, True)
                os.makedirs(ix_dir)
            except OSError:
                pass
            schema = Schema(
                kitab=ID(stored=True),
                vrr=ID(stored=True, unique=False),  # version release
                nodeIdNum=ID(stored=True, unique=False),
                title=TEXT(stored=True, field_boost=1.5, analyzer=analyzer),
                content=TEXT(stored=False, analyzer=analyzer),
                # content = TEXT(stored=False, analyzer=analyzer,
                #                vector=Frequency(analyzer=analyzer)),  # with term vector
                tags=IDLIST(stored=False))
            self.indexer = create_in(ix_dir, schema)
        # self.__ix_qparser = ThMultifieldParser(self.th, ("title", "content",), schema=self.indexer.schema)
        self.__ix_qparser = MultifieldSQParser((
            "title",
            "content",
        ), self.indexer.schema)
        self.__ix_qparser.add_plugin(
            FieldAliasPlugin({
                u"kitab": (u"كتاب", ),
                u"title": (u"عنوان", ),
                u"tags": (u"وسوم", )
            }))
        # self.__ix_pre = whoosh.query.Prefix
        self.__ix_searcher = self.indexer.searcher()
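
MultifieldSQParser here is project-specific; the querying side can be sketched with stock Whoosh classes instead. The following is a hedged sketch only: the index path "ix_dir" and the query string are placeholders (the real code derives the path from th.prefixes[0]), and MultifieldParser stands in for the project's parser without its extensions.

from whoosh.index import open_dir
from whoosh.qparser import FieldAliasPlugin, MultifieldParser

ix = open_dir("ix_dir")  # placeholder path
parser = MultifieldParser(["title", "content"], schema=ix.schema)
parser.add_plugin(FieldAliasPlugin({
    u"kitab": (u"كتاب",),
    u"title": (u"عنوان",),
    u"tags": (u"وسوم",),
}))

with ix.searcher() as searcher:
    # the alias plugin lets users write عنوان:... instead of title:...
    results = searcher.search(parser.parse(u"عنوان:فقه"), limit=10)  # placeholder query
    for hit in results:
        print(hit["kitab"], hit["title"])
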
Example #4
import os
import re
from datetime import datetime

from whoosh.fields import DATETIME, ID, IDLIST, Schema, TEXT
from whoosh.index import create_in

# WEBROOT (the site's URL prefix) is assumed to be defined elsewhere in the original module.


def createIndex(root):
    '''
    Schema definition: path (stored ID field holding the URL path), imgs (stored
    list of image names for an album), image (stored thumbnail URL), content
    (stored text content), date (sortable timestamp).
    '''
    schema = Schema(path=ID(stored=True),
                    imgs=IDLIST(stored=True),
                    image=TEXT(stored=True),
                    content=TEXT(stored=True),
                    date=DATETIME(sortable=True))
    if not os.path.exists("index"):
        os.mkdir("index")
    ix = create_in("index", schema)
    writer = ix.writer()

    root = "/home/httpd/nosher.net/docs/archives/computers"
    files = [os.path.join(root, i) for i in os.listdir(root)]
    for f in files:
        if f.endswith(".txt"):
            with open(f, "r") as fh:
                print(f)
                lines = fh.readlines()
                text = " ".join(lines).replace("\n", "")
                # raw string so "\[" is a literal regex escape, not an invalid string escape
                text = re.sub(r"<.*?>|\[.*?\]", "", text)
                img = "{}/archives/computers/images/{}-s.jpg".format(WEBROOT, f.split("/")[-1].replace(".txt", ""))
                url = "archives/computers/{}".format(f.split("/")[-1].replace(".txt", ""))
                writer.add_document(path=url, content=text, image=img)

    root = "/home/httpd/nosher.net/docs/images/"
    for path, folder, files in os.walk(root):
        for f in files:
            if f == "details.txt":
                full = os.path.join(path, f)
                print(full)
                try:
                    timestamp = datetime.strptime(path.split("/")[-1][0:10], "%Y-%m-%d")
                except ValueError:
                    timestamp = datetime.strptime("1989-10-01", "%Y-%m-%d")
                with open(full, "r") as fh:
                    try:
                        first = 0
                        text = fh.readlines()
                        if len(text) > 3:
                            for i in range(0, len(text)):
                                try:
                                    parts = text[i].split("\t")
                                    if len(parts) > 1:
                                        # convert to URL path
                                        webpath = path.replace("/home/httpd/nosher.net/docs/", "")
                                        # this is an album description match
                                        if parts[0] == "title" or parts[0] == "intro":
                                            images = []
                                            if len(text) > 7:
                                                for j in range(3, 7):
                                                    bits = text[j].split("\t")
                                                    images.append(bits[0])
                                            img = "{}/{}/{}-s.jpg".format(WEBROOT, webpath, text[3].split("\t")[0])
                                            writer.add_document(path=webpath, imgs=",".join(images), content=parts[1], image=img, date=timestamp)
                                        elif parts[0] == "locn":
                                            pass
                                        else:
                                            # this is a match to an individual photo
                                            if first == 0:
                                                first = i
                                            img = "{}/{}/{}-s.jpg".format(WEBROOT, webpath, parts[0])
                                            writer.add_document(path="{}/{}".format(webpath, i - first), content=parts[1], image=img, date=timestamp)
                                except ValueError:
                                    # report the line that failed to parse
                                    print(path, text[i])
                    except UnicodeDecodeError:
                        print(full + " failed")

    writer.commit()
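
Because date is declared DATETIME(sortable=True), the resulting index can be searched and ordered by that field. A minimal query sketch against the "index" directory created above; the query string is a placeholder, and only stored fields (path, image, content, imgs) are readable from the hits:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("index")
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("ZX Spectrum")  # placeholder query string
    # newest first, enabled by DATETIME(sortable=True)
    for hit in searcher.search(query, sortedby="date", reverse=True, limit=20):
        print(hit["path"], hit["image"])
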
Example #5
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        initial_key_count = len(schema_fields)
        content_field_name = ''
        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)

            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)
            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )
        return (content_field_name, Schema(**schema_fields))
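
Examples #1 and #5 both set spelling = True on the document field, which makes that field usable for Whoosh's "did you mean" suggestions. A hedged sketch of what this enables on the searching side; the index path and query string are placeholders, and "text" is only Haystack's usual document field name, not something fixed by these examples:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("whoosh_index")  # placeholder; Haystack derives the path from its connection settings
with ix.searcher() as searcher:
    qp = QueryParser("text", ix.schema)  # assumes the document field is named "text"
    user_query = "pythom"  # deliberately misspelled placeholder
    query = qp.parse(user_query)
    corrected = searcher.correct_query(query, user_query)
    if corrected.query != query:
        print("Did you mean:", corrected.string)
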