class LawSchema(SchemaClass):
    PK = ID(unique=True)
    ACTIVE_FLG = BOOLEAN()
    ARTICLE = ID()
    ARTICLE_HEADING = TEXT(stored=True)
    ARTICLE_HISTORY = TEXT(stored=True)
    CHAPTER = ID()
    CHAPTER_HEADING = TEXT(stored=True)
    CODE_HEADING = TEXT(stored=True)
    DIVISION = ID()
    DIVISION_HEADING = TEXT(stored=True)
    EFFECTIVE_DATE = DATETIME(stored=True)
    HISTORY = TEXT(stored=True)
    LAW_CODE = ID()
    LAW_SECTION_VERSION_ID = ID()
    LEGAL_TEXT = TEXT(stored=True)
    LOB_FILE = ID()
    OP_CHAPTER = ID()
    OP_SECTION = ID()
    OP_STATUES = ID()
    PART = ID()
    SECTION_HISTORY = TEXT(stored=True)
    SECTION_NUM = ID(stored=True)
    SECTION_TITLE = TEXT(stored=True)
    TITLE = ID()
    TRANS_UID = ID()
    TRANS_UPDATE = DATETIME()
def get_schema():
    return Schema(
        id=NUMERIC(stored=True, unique=True, numtype=int),
        title=TEXT(stored=True),
        content=TEXT(),
        correspondent=TEXT(stored=True),
        tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
        type=TEXT(stored=True),
        created=DATETIME(stored=True, sortable=True),
        modified=DATETIME(stored=True, sortable=True),
        added=DATETIME(stored=True, sortable=True),
    )
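# A minimal usage sketch (not from the original source): because `id` is a
# unique NUMERIC field, update_document() can upsert a changed document in
# place. The directory name "doc_index" and the field values are illustrative.
import os
from datetime import datetime
from whoosh.index import create_in, open_dir

def upsert_document(doc_id, title, content):
    if not os.path.isdir("doc_index"):
        os.mkdir("doc_index")
        ix = create_in("doc_index", get_schema())
    else:
        ix = open_dir("doc_index")
    with ix.writer() as writer:
        # Replaces any existing entry that has the same unique id.
        writer.update_document(id=doc_id, title=title, content=content,
                               added=datetime.utcnow())
    return ix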
def get_schema_tema():
    return Schema(titulo=TEXT(stored=True),
                  link=TEXT(stored=True),
                  autor=TEXT(stored=True),
                  fecha=DATETIME(stored=True),
                  numRespuestas=NUMERIC(stored=True),
                  numVisitas=NUMERIC(stored=True))
def get_schema_temas():
    return Schema(titulo=TEXT(stored=True),
                  link_tema=ID(unique=True, stored=True),
                  autor=KEYWORD(stored=True),
                  fecha=DATETIME(stored=True),
                  n_respuestas=STORED,
                  n_visitas=STORED)
def __init__(self, config):
    self.schema = Schema(
        id=ID(unique=True),
        title=TEXT(stored=True, field_boost=3.0,
                   analyzer=StandardAnalyzer() | NgramFilter(minsize=2, maxsize=3)),
        author=TEXT(stored=True),
        creation_date=DATETIME(stored=True),
        pages=STORED,
        content=TEXT(stored=True, analyzer=StandardAnalyzer(stoplist=None)),
        lang=TEXT(stored=True),
        size=STORED,
        tags=KEYWORD(stored=True, commas=True)
    )
    self.index_path = config['WHOOSH_INDEX']
    if not os.path.exists(self.index_path):
        os.mkdir(self.index_path)
        create_in(self.index_path, self.schema)
    self.indexer = open_dir(self.index_path)
    self.parser_content = MultifieldParser(["title", "content"], schema=self.schema)
    self.parser_content.add_plugin(DateParserPlugin())
    self.date_format = {
        'last_24h': u'-24h to now',
        'last_week': u'last week',
        'last_month_to_now': u'-1mo to now',
        'last_year_to_now': u"[-2yrs to now]"
    }
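# A minimal query sketch (not part of the original class): the DateParserPlugin
# registered above lets the relative date strings from `date_format` be embedded
# in a query against the DATETIME field. The method name `search_recent` is an
# assumption, and exact support for the relative-date syntax depends on the
# Whoosh version.
def search_recent(self, text, period='last_24h'):
    q = self.parser_content.parse(
        u"%s creation_date:'%s'" % (text, self.date_format[period]))
    with self.indexer.searcher() as searcher:
        return [hit['title'] for hit in searcher.search(q, limit=20)]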
def createSearchableDatafromUrl():
    url = "https://wanderinginn.com/2016/07/27/1-00/"
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT(stored=True),
                    textdata=TEXT(stored=True),
                    date=DATETIME(sortable=True),
                    url=ID(stored=True),
                    wordcount=NUMERIC(stored=True, sortable=True))
    ix = create_in("indexdir", schema)
    writer = ix.writer()
    while True:
        currentPage = requests.get(url)
        soup = BeautifulSoup(currentPage.content, "lxml")
        body = soup.find("div", {"class": "entry-content"})
        title = soup.find("h1", {"class": "entry-title"})
        p_date = soup.find("time", {"class": "entry-date"})
        p_date_converted = datetime.strptime(p_date['datetime'], '%Y-%m-%dT%H:%M:%S+00:00')
        url_list = body.find_all('a')
        count = len(re.findall(r'\w+', body.text))
        print(title.text)
        print(url)
        print(count)
        print(p_date_converted)
        writer.add_document(title=title.text, content=body.text, textdata=body.text,
                            date=p_date_converted, url=url, wordcount=count)
        try:
            # The last link on each chapter page points to the next chapter;
            # when there is no such link, commit the index and stop crawling.
            url = url_list[-1].get('href')
        except IndexError:
            writer.commit()
            break
def get_schema():
    return Schema(titulo=TEXT(stored=True),
                  tituloOriginal=TEXT(stored=True),
                  fechaEstreno=DATETIME(stored=True),
                  director=TEXT(stored=True),
                  reparto=TEXT,
                  sinopsis=TEXT)
class IndexMsg:
    schema = Schema(
        content=TEXT(stored=True, analyzer=ChineseAnalyzer()),
        url=ID(stored=True, unique=True),
        # For `chat_id` we use TEXT instead of NUMERIC, because NUMERIC does
        # not support iterating over all values of the field
        chat_id=TEXT(stored=True),
        post_time=DATETIME(stored=True, sortable=True),
        sender=TEXT(stored=True),
    )

    def __init__(self, content: str, url: str, chat_id: Union[int, str],
                 post_time: datetime, sender: str):
        self.content = content
        self.url = url
        self.chat_id = int(chat_id)
        self.post_time = post_time
        self.sender = sender

    def as_dict(self):
        return {
            'content': self.content,
            'url': self.url,
            'chat_id': str(self.chat_id),
            'post_time': self.post_time,
            'sender': self.sender
        }

    def __str__(self):
        return 'IndexMsg ' + ', '.join(f'{k}={repr(v)}' for k, v in self.as_dict().items())
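# Illustrative sketch (not from the original project): IndexMsg.as_dict()
# matches the schema's field names, so a message can be indexed directly.
# The directory name, URL and message values below are made up.
import os
from datetime import datetime
from whoosh.index import create_in

os.makedirs('msg_index', exist_ok=True)
ix = create_in('msg_index', IndexMsg.schema)
msg = IndexMsg('hello world', 'https://t.me/c/1234/56', 1234,
               datetime(2021, 5, 1, 12, 0), 'alice')
with ix.writer() as writer:
    writer.add_document(**msg.as_dict())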
def _prepare_writer():
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=KEYWORD,
                    datetime=DATETIME(stored=True))
    ix = create_in(WHOOSH_INDEX_NAME, schema)
    return ix.writer()
def almacenar_datos():
    schem = Schema(categoria=TEXT(stored=True),
                   titulo=TEXT(stored=True),
                   enlaceNoticia=TEXT(stored=True),
                   descripcion=TEXT(stored=True),
                   fecha=DATETIME(stored=True))
    if os.path.exists("Index"):
        shutil.rmtree("Index")
    os.mkdir("Index")
    ix = create_in("Index", schema=schem)
    writer = ix.writer()
    i = 0
    extraer_noticias()
    for j in range(len(titulos)):
        writer.add_document(categoria=str(categorias[j]),
                            titulo=str(titulos[j]),
                            enlaceNoticia=str(enlacesNoticias[j]),
                            descripcion=str(descripciones[j]),
                            fecha=fechas[j])
        i += 1
    writer.commit()
    messagebox.showinfo("Fin de indexado", "Se han indexado " + str(i) + " noticias")
def get_thread_schema():
    return Schema(titulo=TEXT(stored=True),
                  link=ID(stored=True),
                  autor=TEXT(stored=True),
                  fecha=DATETIME(stored=True),
                  respuestas=STORED,
                  visitas=STORED)
def fields_map(self, field_type):
    if field_type == "primary":
        return ID(stored=True, unique=True)
    type_map = {
        'date': types.Date,
        'datetime': types.DateTime,
        'boolean': types.Boolean,
        'integer': types.Integer,
        'float': types.Float
    }
    if isinstance(field_type, str):
        field_type = type_map.get(field_type, types.Text)
    if not isinstance(field_type, type):
        field_type = field_type.__class__
    if issubclass(field_type, (types.DateTime, types.Date)):
        return DATETIME(stored=True, sortable=True)
    elif issubclass(field_type, types.Integer):
        return NUMERIC(stored=True, numtype=int)
    elif issubclass(field_type, types.Float):
        return NUMERIC(stored=True, numtype=float)
    elif issubclass(field_type, types.Boolean):
        return BOOLEAN(stored=True)
    return TEXT(stored=True, analyzer=self.analyzer, sortable=False)
def almacenar_datos():
    # Define the schema for the information
    schem = Schema(titulo=TEXT(stored=True),
                   autor=TEXT(stored=True),
                   fuente=TEXT(stored=True),
                   link=ID(stored=True),
                   fechahora=DATETIME(stored=True),
                   contenido=TEXT)
    # Remove the index directory if it already exists
    if os.path.exists("Index"):
        shutil.rmtree("Index")
    os.mkdir("Index")
    # Create the index
    ix = create_in("Index", schema=schem)
    # Create a writer so documents can be added to the index
    writer = ix.writer()
    i = 0
    lista = extraer_noticias()
    for noticia in lista:
        # Add each news item in the list to the index
        writer.add_document(titulo=str(noticia[0]),
                            autor=str(noticia[1]),
                            fuente=str(noticia[2]),
                            link=str(noticia[3]),
                            fechahora=noticia[4],
                            contenido=str(noticia[5]))
        i += 1
    writer.commit()
    messagebox.showinfo("Fin de indexado", "Se han indexado " + str(i) + " noticias")
def get_schema():
    return Schema(categoria=TEXT(stored=True),
                  titulo=TEXT(stored=True),
                  enlace=TEXT(stored=True),
                  fecha=DATETIME(stored=True),
                  descripcion=TEXT(stored=True),
                  nombrefichero=ID(stored=True))
def get_schema():
    return Schema(remitente=TEXT(stored=True),
                  destinatarios=TEXT(stored=True),
                  fecha=DATETIME(stored=True),
                  asunto=TEXT(stored=True),
                  contenido=TEXT(stored=True),
                  nombrefichero=ID(stored=True))
def almacenar_datos():
    schem = Schema(titulo=TEXT(stored=True),
                   tituloOriginal=TEXT(stored=True),
                   fechaEstrenoSpain=DATETIME(stored=True),
                   paises=TEXT(stored=True),
                   generos=TEXT(stored=True),
                   director=TEXT(stored=True),
                   sinopsis=TEXT(stored=True))
    if os.path.exists("Index"):
        shutil.rmtree("Index")
    os.mkdir("Index")
    ix = create_in("Index", schema=schem)
    writer = ix.writer()
    i = 0
    extraerDatos()
    for j in range(len(titulos)):
        writer.add_document(titulo=str(titulos[j]),
                            tituloOriginal=str(titulosOriginales[j]),
                            fechaEstrenoSpain=fechaEstrenoSpain[j],
                            paises=str(paises[j]),
                            generos=generos[j],
                            director=str(directores[j]),
                            sinopsis=str(sinopsis[j]))
        i += 1
    writer.commit()
    messagebox.showinfo("Fin de indexado", "Se han indexado " + str(i) + " películas")
def create():
    """Create a new Whoosh index."""
    print('creating new index in directory %s' % DIRECTORY)
    os.system('rm -rf %s' % DIRECTORY)
    os.mkdir(DIRECTORY)
    schema = Schema(source=ID(stored=True, unique=True),
                    cached=ID(stored=True, unique=True),
                    hash=ID(stored=True, unique=True),
                    title=TEXT(stored=True),
                    author=TEXT(stored=True),
                    year=TEXT(stored=True),
                    notes=TEXT(stored=True),
                    text=TEXT(stored=True),
                    tags=TEXT(stored=True, analyzer=KeywordAnalyzer()),
                    added=DATETIME(stored=True),
                    mtime=DATETIME(stored=True))
    create_in(DIRECTORY, schema, NAME)
class EventSchema(SchemaClass):
    id = NUMERIC(stored=True, unique=True)
    title = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=1.5)
    description = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=1.0)
    date = DATETIME(stored=True)
class PhotSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer, spelling=True)
    short_code = ID(stored=True, unique=True)
    disable = BOOLEAN()
    tags = TEXT(analyzer=tag_analyzer, spelling=True)
    user = ID()
def get_schema():
    return Schema(title=TEXT(stored=True),
                  link=TEXT(stored=True),
                  autor=TEXT(stored=True),
                  date=DATETIME(stored=True),
                  respuestas=TEXT(stored=True),
                  visitas=TEXT(stored=True),
                  respuestasText=TEXT(stored=True))
def __init__(self, location):
    assert location
    self.schema = Schema(uuid=ID(stored=True, unique=True),
                         sender=TEXT(stored=True),
                         recipient=TEXT(stored=True),
                         keywords=TEXT(stored=True),
                         date=DATETIME(stored=True),
                         repo_files=STORED)
    self.location = location
def make_schema():
    return Schema(
        paper_field=KEYWORD(stored=True, lowercase=True, scorable=True),
        title=TEXT(stored=True, analyzer=StemmingAnalyzer()),
        authors=KEYWORD(stored=True, lowercase=True),
        pdf=ID(stored=True),
        abstract=TEXT(stored=True, analyzer=StemmingAnalyzer()),
        date=DATETIME(stored=True),
    )
def init_extensions(app):
    global use_cache
    whoosh_searcher.init_app(app)
    configure_uploads(app, upload_photos)
    mail.init_app(app)
    admin.init_app(app)
    mongo.init_app(app, "MONGO")
    oauth.init_app(app)
    login_manager.init_app(app)
    # use_cache = app.config.get('USE_CACHE', False)
    # if use_cache:
    #     cache.init_app(app, {})
    with app.app_context():
        # Register the flask-admin views
        admin.add_view(admin_view.RolesModelView(mongo.db['roles'], '角色管理'))
        admin.add_view(admin_view.UsersModelView(mongo.db['users'], '用户管理'))
        admin.add_view(admin_view.CatalogsModelView(mongo.db['catalogs'], '栏目管理', category='内容管理'))
        admin.add_view(admin_view.PostsModelView(mongo.db['posts'], '帖子管理', category='内容管理'))
        admin.add_view(admin_view.PassagewaysModelView(mongo.db['passageways'], '温馨通道', category='推广管理'))
        admin.add_view(admin_view.FriendLinksModelView(mongo.db['friend_links'], '友链管理', category='推广管理'))
        admin.add_view(admin_view.PagesModelView(mongo.db['pages'], '页面管理', category='推广管理'))
        admin.add_view(admin_view.FooterLinksModelView(mongo.db['footer_links'], '底部链接', category='推广管理'))
        admin.add_view(admin_view.AdsModelView(mongo.db['ads'], '广告管理', category='推广管理'))
        admin.add_view(admin_view.OptionsModelView(mongo.db['options'], '系统设置'))

        # Initialize the Whoosh index for posts
        chinese_analyzer = ChineseAnalyzer()
        post_schema = Schema(obj_id=ID(unique=True, stored=True),
                             title=TEXT(stored=True, analyzer=chinese_analyzer),
                             content=TEXT(stored=True, analyzer=chinese_analyzer),
                             create_at=DATETIME(stored=True),
                             catalog_id=ID(stored=True),
                             user_id=ID(stored=True))
        whoosh_searcher.add_index('posts', post_schema)
def get_schema():
    return Schema(numeroJornada=NUMERIC(stored=True),
                  local=TEXT(stored=True),
                  visitante=TEXT(stored=True),
                  golesLocales=NUMERIC,
                  golesVisitantes=NUMERIC,
                  fecha=DATETIME(stored=True),
                  autor=TEXT,
                  titulo=TEXT(stored=True),
                  cronica=TEXT)
def get_schema():
    return Schema(
        id=NUMERIC(stored=True, unique=True),
        title=TEXT(sortable=True),
        content=TEXT(),
        asn=NUMERIC(sortable=True),
        correspondent=TEXT(sortable=True),
        correspondent_id=NUMERIC(),
        has_correspondent=BOOLEAN(),
        tag=KEYWORD(commas=True, scorable=True, lowercase=True),
        tag_id=KEYWORD(commas=True, scorable=True),
        has_tag=BOOLEAN(),
        type=TEXT(sortable=True),
        type_id=NUMERIC(),
        has_type=BOOLEAN(),
        created=DATETIME(sortable=True),
        modified=DATETIME(sortable=True),
        added=DATETIME(sortable=True),
    )
def get_schema():
    analyzer = StemmingAnalyzer(stoplist=STOP) | StopFilter(stoplist=STOP)
    schema = Schema(title=TEXT(analyzer=analyzer, stored=True, sortable=True),
                    content=TEXT(analyzer=analyzer, stored=True, sortable=True),
                    tags=KEYWORD(commas=True, stored=True),
                    author=TEXT(stored=True),
                    uid=ID(unique=True, stored=True),
                    lastedit_date=DATETIME(sortable=True, stored=True))
    return schema
def get_schema():
    return Schema(jornada=TEXT(stored=True),
                  local=TEXT(stored=True),
                  visitante=TEXT(stored=True),
                  resultado=TEXT(stored=True),
                  fecha=DATETIME(stored=True),
                  autor=TEXT(stored=True),
                  titular=TEXT(stored=True),
                  titulo=TEXT(stored=True),
                  texto=TEXT)
def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True,
                                                                     field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int,
                                                                 field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float,
                                                                 field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored,
                                                               field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start',
                                                                    stored=field_class.stored,
                                                                    field_boost=field_class.boost)
        else:
            # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(),
            #                                                   field_boost=field_class.boost, sortable=True)
            # Use the Chinese analyzer for word segmentation instead of the default stemmer.
            schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=ChineseAnalyzer(),
                                                              field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))
class NoteSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer, spelling=True)
    contents = TEXT(spelling=True)
    public = BOOLEAN()
    draft = BOOLEAN()
    short_code = ID(stored=True, unique=True)
    disable = BOOLEAN()
    reported = BOOLEAN()
    tags = TEXT(analyzer=tag_analyzer, spelling=True)
    user = ID()
def get_index():
    ixname = "search-index"
    if os.path.isdir(ixname):
        from whoosh.index import open_dir
        ix = open_dir(ixname)
    else:
        schema = Schema(title=TEXT(stored=True),
                        url=ID(stored=True, unique=True),
                        content=TEXT(stored=True),
                        modified=DATETIME(sortable=True))
        os.mkdir(ixname)
        ix = create_in(ixname, schema)
    return ix
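# Illustrative follow-up (not part of the original function): because "url" is
# unique and "modified" is sortable, pages can be upserted and search results
# returned newest-first. The helper name and argument values are made up.
from datetime import datetime
from whoosh.qparser import QueryParser

def add_and_search(title, url, content, text):
    ix = get_index()
    with ix.writer() as writer:
        writer.update_document(title=title, url=url, content=content,
                               modified=datetime.utcnow())
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(text)
        return [(hit["title"], hit["url"])
                for hit in searcher.search(query, sortedby="modified", reverse=True)]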