# Index the debate transcripts (Python 2 style code: csv rows are byte strings)
import os
import csv
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID, KEYWORD, NUMERIC
from whoosh.analysis import StemmingAnalyzer

schema = Schema(person=ID(stored=True),
                debate_no=TEXT(stored=True),
                sentiment_score=NUMERIC(stored=True, sortable=True),
                tags=KEYWORD(stored=True),
                sentence=TEXT(spelling=True, analyzer=StemmingAnalyzer(), stored=True))

FIELD_KEYWORDS = 'keywords'
FIELD_CONTENT = 'sentences'

if not os.path.exists("index"):
    os.mkdir("index")
ix = create_in("index", schema)

# `datareader` is assumed to be a csv.reader over the transcript CSV
# (its creation is not part of this snippet).

# create list of lists
data = []
for row in datareader:
    data.append(row)

# delete header
del data[0]

# create list of dictionaries (using header terms as keys)
transcript = []
for row in data:
    dct = {}
    dct['party'] = row[0]
    dct['debateNo'] = row[1].decode('utf-8')
    dct['sentenceNo'] = row[2]
    dct['sequenceNo'] = row[3]
    dct['speaker'] = row[4].decode('utf-8')
    dct['text'] = row[5]
    transcript.append(dct)

# fix error in transcript for second Republican debate
# (WALKER's lines had been assigned to TRUMP or BUSH)
for row in transcript:
    if row['party'] == 'rep' and row['debateNo'] == '02' and row['text'].startswith('WALKER'):
        row['speaker'] = u'WALKER'
        text = bytearray(row['text'])
        del text[0:7]          # strip the leading speaker label
        row['text'] = str(text)

# decode sentences to unicode
for row in transcript:
    row['text'] = row['text'].decode('utf-8')

rep_speakers = ['CRUZ', 'RUBIO', 'KASICH', 'CARSON', 'FIORINA', 'PAUL',
                'HUCKABEE', 'WALKER', 'TRUMP', 'CHRISTIE', 'BUSH']
dem_speakers = ['CLINTON', 'SANDERS', 'CHAFEE', "O'MALLEY", 'WEBB']

# filter out the moderators
transcript_no_moderators = []
for row in transcript:
    if row['speaker'] in rep_speakers:
        transcript_no_moderators.append(row)
    if row['speaker'] in dem_speakers:
        transcript_no_moderators.append(row)

# open the index back up
ix = open_dir("index")

# create the test batch
testbatch = []
for row in transcript_no_moderators:
    testbatch.append(row)

writer = ix.writer()
for row in testbatch:
    writer.add_document(person=row['speaker'],
                        debate_no=row['debateNo'],
                        sentence=row['text'])
writer.commit()

# sentiment_score is already in the schema, so calculate the sentiment score
# inside this loop and write it out with each document
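The closing comment asks for a sentiment score to be computed inside the indexing loop. A minimal sketch of how that loop could be extended, assuming the TextBlob library is available (TextBlob is not part of the original snippet):

from textblob import TextBlob

writer = ix.writer()
for row in testbatch:
    # TextBlob polarity ranges from -1.0 (negative) to 1.0 (positive); the
    # sentiment_score field is a default (integer) NUMERIC, so scale and round it.
    polarity = TextBlob(row['text']).sentiment.polarity
    writer.add_document(person=row['speaker'],
                        debate_no=row['debateNo'],
                        sentiment_score=int(round(polarity * 100)),
                        sentence=row['text'])
writer.commit()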
whoosh.fields.NGRAM -- TBD. Advanced users can also create their own field types.

Creating a Schema

from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer

schema = Schema(from_addr=ID(stored=True),
                to_addr=ID(stored=True),
                subject=TEXT(stored=True),
                body=TEXT(analyzer=StemmingAnalyzer()),
                tags=KEYWORD)

If you are not passing any keyword arguments to a field's constructor, you can omit the trailing parentheses (e.g. fieldname=TEXT instead of fieldname=TEXT()); Whoosh will instantiate the field for you.

Alternatively, you can define a schema declaratively by subclassing SchemaClass:

from whoosh.fields import SchemaClass, TEXT, KEYWORD, ID, STORED

class MySchema(SchemaClass):
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT
    tags = KEYWORD

You can pass the class itself to create_in() or create_index() instead of an instance.
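A quick illustration of that last point (the "indexdir" path here is just a placeholder):

import os
from whoosh.index import create_in

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

# create_in accepts the SchemaClass subclass itself; Whoosh instantiates it.
ix = create_in("indexdir", MySchema)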
class Ingredient(db.Model):
    __searchable__ = ['name']
    __analyzer__ = StemmingAnalyzer()

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(250), nullable=False)
class BmarkSchema(SchemaClass):
    bid = ID(unique=True, stored=True)
    description = TEXT
    extended = TEXT
    tags = KEYWORD
    readable = TEXT(analyzer=StemmingAnalyzer())
try:
    from jieba.analyse import ChineseAnalyzer
except Exception as err:
    print(repr(err))
    ChineseAnalyzer = None

SITE_CFG['LANG'] = SITE_CFG.get('LANG', 'zh')

# Use the jieba analyzer for Chinese; fall back to stemming otherwise.
if SITE_CFG['LANG'] == 'zh' and ChineseAnalyzer:
    TOR_SCHEMA = Schema(title=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                        catid=TEXT(stored=True),
                        type=TEXT(stored=True),
                        link=ID(unique=True, stored=True),
                        content=TEXT(stored=True, analyzer=ChineseAnalyzer()))
else:
    TOR_SCHEMA = Schema(title=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                        catid=TEXT(stored=True),
                        type=TEXT(stored=True),
                        link=ID(unique=True, stored=True),
                        content=TEXT(stored=True, analyzer=StemmingAnalyzer()))

WHOOSH_BASE = 'database/whoosh'
if os.path.exists(WHOOSH_BASE):
    TOR_IDX = open_dir(WHOOSH_BASE)
else:
    os.makedirs(WHOOSH_BASE)
    TOR_IDX = create_in(WHOOSH_BASE, TOR_SCHEMA)


def do_for_app(rand=True, kind='', doc_type=None):
    '''
def get_absolute_url(self):
    return reverse('recommendations') + '?seed=%s' % self.paper_id

def get_title(self):
    return self.title

def set_rank(self, rank):
    self.rank = rank
    return self


paper_schema = Schema(
    paper_id=ID(stored=True),
    title=TEXT(stored=True),
    abstract=TEXT(analyzer=StemmingAnalyzer()),
    paper_url=TEXT(),
    aspect_tasks=KEYWORD,
    aspect_methods=KEYWORD,
    aspect_datasets=KEYWORD,
)

if settings.ASPECT_KNN_WHOOSH_INDEX_PATH and os.path.exists(settings.ASPECT_KNN_WHOOSH_INDEX_PATH):
    ix = index.open_dir(settings.ASPECT_KNN_WHOOSH_INDEX_PATH)  # '/Users/maos01/Desktop/special-docembeds-release-files/output/pwc/whoosh_index'
else:
    ix = None

# Load vector models
generic_vecs = (KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_GENERIC_W2V_PATH,
                                                  limit=settings.ASPECT_KNN_LIMIT)
                if settings.ASPECT_KNN_GENERIC_W2V_PATH and os.path.exists(settings.ASPECT_KNN_GENERIC_W2V_PATH)
                else None)  # '/Users/maos01/Downloads/specter.1k.w2v.txt'
task_vecs = (KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_TASK_W2V_PATH,
                                               limit=settings.ASPECT_KNN_LIMIT)
             if settings.ASPECT_KNN_TASK_W2V_PATH and os.path.exists(settings.ASPECT_KNN_TASK_W2V_PATH)
             else None)
def main():
    file_content_doc1 = open("rural_min.txt").read()
    file_content_doc2 = open("science_min.txt").read()
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")
        if option == "1":
            sent_tokenize_list1 = sent_tokenize(file_content_doc1, language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2, language='english')
            if not os.path.exists("index_task3_min"):
                os.mkdir("index_task3_min")

            my_analyzer = RegexTokenizer() | StopFilter() | LowercaseFilter() | Lemmatizer()
            pos_tagger = RegexTokenizer() | StopFilter() | LowercaseFilter() | PosTagger()
            wordnetsyn1 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets()
            wordnetsyn2 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets1()
            wordnetsyn3 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets2()
            wordnetsyn4 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets3()

            schema = Schema(id=ID(stored=True, unique=True),
                            standard=TEXT(stored=True, analyzer=StandardAnalyzer()),
                            stem_text=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                            lemma=TEXT(stored=True, analyzer=my_analyzer),
                            pos_text=TEXT(stored=True, analyzer=pos_tagger),
                            hypernym=TEXT(stored=True, analyzer=wordnetsyn1),
                            hyponym=TEXT(stored=True, analyzer=wordnetsyn2),
                            holonym=TEXT(stored=True, analyzer=wordnetsyn3),
                            meronyms=TEXT(stored=True, analyzer=wordnetsyn4),
                            dependency=TEXT(analyzer=DependencyParser()))

            ix = index.create_in("index_task3_min", schema)
            writer = ix.writer()

            for sentence in sent_tokenize_list1:
                writer.add_document(standard=sentence, stem_text=sentence,
                                    lemma=sentence, pos_text=sentence,
                                    hypernym=sentence, hyponym=sentence,
                                    meronyms=sentence, holonym=sentence,
                                    dependency=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(standard=sentence, stem_text=sentence,
                                    lemma=sentence, pos_text=sentence,
                                    hypernym=sentence, hyponym=sentence,
                                    meronyms=sentence, holonym=sentence,
                                    dependency=sentence)
            writer.commit()
            print_index_details(ix)
            print("\n\n Index created with various features as its fields")

        elif option == "2":
            ix = index.open_dir("index_task3")
            with ix.searcher(weighting=whoosh.scoring.BM25F()) as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query_text = MultifieldParser(["standard", "stem_text", "lemma",
                                               "pos_text", "hyponym", "meronyms",
                                               "hypernym", "holonym"],
                                              schema=ix.schema, group=og).parse(q)
                results = searcher.search(query_text, limit=10)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["standard"], sep=":")
                    print("\n")

        elif option == "3":
            print("\n Goodbye")
            sys.exit(0)
            option = None
        else:
            print("\n Not valid choice try again...!")
import os.path

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.analysis import StemmingAnalyzer
from whoosh.lang.snowball import english
from whoosh.lang.porter2 import stem
# from whoosh.lang.paicehusk import PaiceHuskStemmer

# Use the Porter2 stemming algorithm when indexing
stem_ana = StemmingAnalyzer(stemfn=stem)
schema = Schema(docID=NUMERIC(stored=True), contents=TEXT(analyzer=stem_ana))

index_dir = "index"
if not os.path.exists(index_dir):
    os.makedirs(index_dir)
ix = create_in(index_dir, schema)
writer = ix.writer()

with open('doc/document.txt', 'r') as f:
    text = f.read()

docs = text.split(' /\n')[:-1]
for doc in docs:
    br = doc.find('\n')
    docID = int(doc[:br])
    doc_text = doc[br + 1:]
    writer.add_document(docID=docID, contents=doc_text)
writer.commit()
class Recipe(db.Model):
    __searchable__ = ['title', 'description', 'calories']
    __analyzer__ = StemmingAnalyzer()

    id = db.Column(db.Integer, primary_key=True)
    layout = db.Column(db.String(15))
    title = db.Column(db.String(100), unique=True, nullable=False)
    title_formatted = db.Column(db.String(100))
    filename = db.Column(db.String(100))
    image_path = db.Column(db.String(104))
    image_credit = db.Column(db.String(150))
    source = db.Column(db.String(150))
    description = db.Column(db.String(750))
    prep = db.Column(db.String(10))
    cook = db.Column(db.String(10))
    ready = db.Column(db.String(10))
    servings = db.Column(db.String(5))
    calories = db.Column(db.String(20))
    file_last_modified = db.Column(db.DateTime)
    tags = db.relationship('Tag', secondary=recipe_tag, lazy=True,
                           backref=db.backref('recipe', lazy=True))
    ingredients = db.relationship('Ingredient', secondary=recipe_ingredient, lazy=True,
                                  backref=db.backref('recipe', lazy=True))
    directions = db.relationship('Direction', secondary=recipe_direction, lazy=True,
                                 backref=db.backref('recipe', lazy=True))
    notes = db.relationship('Note', secondary=recipe_note, lazy=True,
                            backref=db.backref('recipe', lazy=True))

    def api_model(self):
        tags = [tag.name for tag in self.tags]
        ingredients = [ingredient.name for ingredient in self.ingredients]
        directions = [direction.name for direction in self.directions]
        notes = [note.name for note in self.notes]

        model = {
            'id': self.id,
            'layout': self.layout,
            'title': self.title,
            'title_formatted': self.title_formatted,
            'filename': self.filename,
            'image_path': self.image_path,
            'image_credit': self.image_credit,
            'source': self.source,
            'description': self.description,
            'prep': self.prep,
            'cook': self.cook,
            'ready': self.ready,
            'servings': self.servings,
            'calories': self.calories,
            'file_last_modified': self.file_last_modified,
            'tags': tags,
            'directions': directions,
            'ingredients': ingredients,
            'notes': notes
        }
        return model

    def __repr__(self):
        return f'<Recipe: {self.title}>'
def FragmenterAnalyzer():
    ret = StemmingAnalyzer(minsize=0, stoplist=None)
    return ret
from whoosh.writing import AsyncWriter

parser = argparse.ArgumentParser()
parser.add_argument("data", type=str)
parser.add_argument("-num_docs", type=int, default=None)
parser.add_argument("-threads", type=int, default=1)  # seems using more than 1 thread may be broken?
parser.add_argument("-reload", action="store_true")
parser.add_argument("-migrate_url_to_text_field", action="store_true")
args = parser.parse_args()

schema = Schema(
    docid=ID(stored=True),
    url=ID(stored=True),
    title=TEXT(stored=True, analyzer=StemmingAnalyzer()),  # maybe no stemming here?
    body=TEXT(analyzer=StemmingAnalyzer()),
)

index_dir = "data/msmarcoidx" if args.num_docs is None else "data/quickidx"
if not os.path.exists(index_dir):
    os.mkdir(index_dir)
    index.create_in(index_dir, schema)
    args.reload = True

storage = FileStorage(index_dir)
# Open an existing index
ix = storage.open_index()

if args.migrate_url_to_text_field:
    writer = ix.writer()
def search_engine(analyzer=StemmingAnalyzer(), max_res=150, multifield_flag=1,
                  only_title_flag=0,
                  directory_containing_the_index=r"C:\Users\claba\Desktop\DMT works\HW_1\Index_part_1",
                  query_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv",
                  gt_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Ground_Truth.tsv",
                  doc_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\DOCUMENTS\\",
                  conf_label="Not Specified", mrr_eps=.32,
                  k_interval_for_nDCG=range(1, 151)):
    ###
    ### Create a Schema
    ###
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=analyzer),
                    content=TEXT(stored=False, analyzer=analyzer))

    ###
    ### Create an empty index according to the schema just defined
    ###
    ix = create_in(directory_containing_the_index, schema)

    ###
    ### Get the query set (reset index due to missing values in the IDs)
    ###
    query_set = pd.read_csv(query_dir, engine="python", sep="\t", index_col="Query_ID").reset_index()

    ###
    ### Get the ground truth (a little manipulation to group by query and align IDs)
    ###
    gt_tmp = pd.read_csv(gt_dir, engine="python", sep="\t")
    gt_tmp = gt_tmp.groupby('Query_id')['Relevant_Doc_id'].apply(lambda x: x.tolist()).to_dict()
    gt = defaultdict(list)
    j = 1
    for i in range(len(gt_tmp)):
        while gt[i] == []:
            try:
                gt[i] = gt_tmp[j]
                j += 1
            except KeyError:
                j += 1

    number_of_queries = len(query_set)
    num_of_docs = 1400

    ###
    ### We'll iterate over the following lists to switch the SE scoring function and get their names
    ###
    scoring_functions_list = [scoring.PL2(), scoring.Frequency(), scoring.BM25F(), scoring.TF_IDF()]
    scoring_name = [re.findall(r"(?<=scoring\.)[\w\W]*(?=object)", str(score))[0]
                    for score in scoring_functions_list]

    ###
    ### Fill the index
    ###
    writer = ix.writer()
    for doc in range(num_of_docs):
        id_ = str(doc + 1)
        title, content = doc_retriver(doc_dir + "______" + str(doc + 1) + ".html")
        writer.add_document(id=id_, title=title, content=content)
    writer.commit()

    ###
    ### This "tensor" stores all the results we need.
    ### Its dimensions are #Results x #Queries x #SE_configurations
    ###
    results_mat = np.zeros([max_res, number_of_queries, len(scoring_functions_list)])

    evaluations_summary = {}   # dict to store MRR and R-precision distribution summaries
    ndcg = defaultdict(list)   # dict of nDCG values over varying k, for all SEs with MRR > .32

    ###
    ### Run the SEs
    ###
    for idx_s, scorer in enumerate(scoring_functions_list):
        for idx, query in enumerate(query_set["Query"]):
            input_query = query

            ###
            ### Select a scoring function
            ###
            scoring_function = scorer

            ###
            ### Create a QueryParser for parsing the input_query,
            ### based on the SE configuration chosen by the user
            ###
            if multifield_flag:
                qp = MultifieldParser(["title", "content"], ix.schema)
                parsed_query = qp.parse(input_query)   # parse the query
            else:
                if only_title_flag:
                    qp = SimpleParser("title", ix.schema)
                    parsed_query = qp.parse(input_query)
                else:
                    qp = SimpleParser("content", ix.schema)
                    parsed_query = qp.parse(input_query)

            ###
            ### Create a searcher for the index with the selected scoring function
            ###
            searcher = ix.searcher(weighting=scoring_function)

            ###
            ### Perform a search and store the results
            ###
            results = searcher.search(parsed_query, limit=max_res)
            results_mat[0:len(results), idx, idx_s] = [hit["id"] for hit in results]
            searcher.close()

        mrr_res = mrr(results_mat[:, :, idx_s], gt)

        if mrr_res >= mrr_eps:
            ###
            ### Compute and summarize the R-precision distribution
            ###
            r_res = r_precision(results_mat[:, :, idx_s], gt)
            mean = np.mean(list(r_res.values()))
            first_q = np.percentile(list(r_res.values()), 25)
            third_q = np.percentile(list(r_res.values()), 75)
            median = np.median(list(r_res.values()))
            minr = min(list(r_res.values()))
            maxr = max(list(r_res.values()))
            evaluations_summary[conf_label + "," + scoring_name[idx_s]] = \
                [mrr_res, mean, minr, first_q, median, third_q, maxr]

            ###
            ### Compute nDCG@k for varying k for each scoring function
            ###
            for k in k_interval_for_nDCG:
                tmp_res = np.mean(list(nDCG(results_mat[:, :, idx_s], gt, k=k).values()))
                ndcg[conf_label + "," + scoring_name[idx_s]].append(tmp_res)
        else:
            evaluations_summary[conf_label + "," + scoring_name[idx_s]] = [mrr_res]

        ###
        ### Just to see what's happening
        ###
        print("Configuration:" + conf_label + "," + scoring_name[idx_s] + " ==> MRR = " + str(mrr_res))

    # The evaluation result, obviously, contains only the MRR for SEs with MRR < .32
    return evaluations_summary, ndcg
# app.config['SECRET_KEY'] = 'f6eeaa4486447025a35ab182035a34a0'
# app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///site.db'
# db = SQLAlchemy(app)
# from ed_main import routes
# Converted the app into a package structure.
from flask import Flask
from flask_wtf.csrf import CSRFProtect, CSRFError
from flask_sqlalchemy import SQLAlchemy
from flask_bcrypt import Bcrypt
from whoosh.analysis import StemmingAnalyzer
import flask_whooshalchemy
from flask_admin import Admin

app = Flask(__name__)
csrf = CSRFProtect(app)
app.config['SECRET_KEY'] = 'f6eeaa4486447025a35ab182035a34a0'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///site.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = True
app.config['WHOOSH_BASE'] = 'whoosh'
app.config['WHOOSH_ANALYZER'] = StemmingAnalyzer()
app.config['FLASK_ADMIN_SWATCH'] = 'cerulean'

db = SQLAlchemy(app)
bcrypt = Bcrypt(app)
admin = Admin(app, name='Project Ed', template_mode='bootstrap3')

from ed_main import routes
from ed_main import admin_views
def create_schema():
    schema = Schema(doc_text=TEXT(analyzer=StemmingAnalyzer(), stored=True))
    return schema
import os
import os.path
import datetime

from whoosh import index
from whoosh import query
from whoosh.fields import Schema, TEXT, ID, STORED, DATETIME, KEYWORD, NUMERIC
from whoosh.analysis import StemmingAnalyzer
from get_law_fields import list_from_file, get_fields
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.query import Query, Term, And

# import stopwords
with open("scripts/search_static/stopwords.txt", 'r') as f:
    stopwords = sorted(list(f.read().split('\n')))

lang_ana = StemmingAnalyzer(stoplist=stopwords)

# CREATE A SCHEMA
"""
The schema defines the fields that each document (i.e. a law, in most cases) may contain.

law_name     -- name of the document. Searchable and stored.
law_body     -- the intro and articles of a law. Searchable only.
law_num_date -- the number of the law and the exact date. Searchable and stored.
pub_year     -- the date of the Official Gazette publication.
article_one  -- title and first few sentences of article one. Stored only, for
                displaying in search results.
"""
schema = Schema(
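The Schema(...) call is cut off at the end of this excerpt. Based on the field descriptions in the docstring, it presumably continues roughly as follows (the field types and options here are assumptions, not the original code):

schema = Schema(
    law_name=TEXT(analyzer=lang_ana, stored=True),      # searchable and stored
    law_body=TEXT(analyzer=lang_ana),                    # searchable only
    law_num_date=TEXT(analyzer=lang_ana, stored=True),   # searchable and stored
    pub_year=NUMERIC(stored=True, sortable=True),        # could equally be DATETIME
    article_one=STORED,                                   # stored only, shown in results
)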
""" Created on Fri Dec 1 14:14:36 2017 @author: francruz """ import pandas as pd import whoosh import csv # Creating the schema from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED from whoosh.analysis import StemmingAnalyzer schema = Schema(asin=KEYWORD(stored=True, scorable=True, sortable=True), helpful=STORED, reviewText=TEXT(analyzer=StemmingAnalyzer(), phrase=False, stored=True), overall=TEXT(analyzer=StemmingAnalyzer(), phrase=False), reviewTime=ID(stored=True), title=TEXT(analyzer=StemmingAnalyzer(), phrase=False, stored=True), price=STORED, brand=KEYWORD(stored=True), reviewLength=STORED, reviewWords=STORED, avgWordLength=STORED, expresiveness=STORED, ratingDelta=STORED, priceDelta=STORED)
# (tail of a CREATE TABLE statement started earlier in the original file)
            imageURL text, price numeric, rating numeric, noOfReviews numeric,
            savings numeric, percentageSavings numeric, productDesc text,
            reviewPolarity numeric, countryOfOrigin text, overview text)''')
c.close()

# initialise SenticNet
sn = SenticNet()

# The analyzer stems, removes accents (so words like "cafe" and "facade" still
# match their accented forms) and removes stopwords.
hsn_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | StopFilter()

SCHEMA = Schema(
    filename=ID(unique=True, stored=True, analyzer=hsn_analyzer),
    content=TEXT(analyzer=hsn_analyzer, spelling=True),
    price=NUMERIC(sortable=True, stored=True),
    rating=NUMERIC(sortable=True, stored=True),
    noOfReviews=NUMERIC(sortable=True, stored=True),
    savings=NUMERIC(sortable=True, stored=True),
    percentageSavings=NUMERIC(sortable=True, stored=True),
    review=TEXT(analyzer=hsn_analyzer, spelling=True),
    productDesc=TEXT(stored=True),
    reviewPolarity=NUMERIC(sortable=True, stored=True),
    countryOfOrigin=TEXT(sortable=True, stored=True),
    overview=TEXT(stored=True),
)
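As an aside, a chained analyzer like hsn_analyzer can be exercised directly; this small assumed snippet (not from the original project) only illustrates the callable-analyzer API:

# Whoosh analyzers are callable and yield Token objects; .text holds the
# processed form (lowercased, stemmed, accent-folded, stopwords dropped).
print([t.text for t in hsn_analyzer(u"The café's façade is charming")])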
from whoosh.index import create_in
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StemmingAnalyzer
from whoosh import qparser
from whoosh.qparser import QueryParser
from bs4 import BeautifulSoup
import re, os, codecs, sys


# Reject elements inside style/script/document/head/title tags, and HTML comments.
def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', element.encode('utf-8')):
        return False
    return True


schema = Schema(id=ID(stored=True),
                img=TEXT(stored=True),
                title=TEXT(stored=True),
                h1=TEXT(analyzer=StemmingAnalyzer()),
                content=TEXT(analyzer=StemmingAnalyzer(), stored=True))

dir = os.listdir('dataset')
ix = create_in("index", schema)

for i, l in enumerate(dir):
    p = "dataset/" + l
    html = codecs.open(p, "r", "utf-8").read()
    soup = BeautifulSoup(html, 'html.parser')
    tit = u''
    tit += soup.title.string
    imgs = soup.find('h1').find_all_next('img')[0]
    im = u'https:'
    im += imgs["src"]
    texts = soup.findAll(text=True)
    visible_texts = filter(visible, texts)
    div = soup.find("div", {"id": "content"})
    headers = div.find_all(['h1', 'h2', 'h3'])
import sys

import sqlalchemy
from inspect import isclass
from flask_sqlalchemy import models_committed
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.inspection import inspect
from sqlalchemy.types import Boolean, Date, DateTime, Float, Integer, Text
from whoosh import index as whoosh_index
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import BOOLEAN, DATETIME, ID, NUMERIC, TEXT
from whoosh.fields import Schema as _Schema
from whoosh.qparser import AndGroup, MultifieldParser, OrGroup

from .backends import BaseBackend, logger, relation_column

DEFAULT_WHOOSH_INDEX_NAME = 'msearch'
DEFAULT_ANALYZER = StemmingAnalyzer()
DEFAULT_PRIMARY_KEY = 'id'

if sys.version_info[0] < 3:
    str = unicode


class Schema(object):
    def __init__(self, table, analyzer=None):
        self.table = table
        self.analyzer = analyzer
        self.schema = _Schema(**self.fields)

    @property
    def fields(self):
        model = self.table
import os
import logging

import whoosh
import sqlalchemy
import flask_sqlalchemy
import whoosh.index
from whoosh import fields as whoosh_fields
from whoosh.analysis import StemmingAnalyzer
from whoosh.qparser import OrGroup, AndGroup, MultifieldParser
from whoosh.filedb.filestore import RamStorage
from whoosh.writing import AsyncWriter
from sqlalchemy import types as sql_types
from sqlalchemy.orm import EXT_CONTINUE

logger = logging.getLogger(__name__)

# DEFAULTS
DEFAULT_WHOOSH_ANALYZER = StemmingAnalyzer()
DEFAULT_WHOOSH_INDEX_PATH = os.path.join(os.path.abspath(os.getcwd()), '.indexes')

UPDATE_FIELDS = ('update', 'insert')
TEXT_TYPES = (sql_types.String, sql_types.Unicode, sql_types.Text)
DATE_TYPES = (sql_types.DateTime, sql_types.Date)
NUM_TYPES = (sql_types.Integer, sql_types.BigInteger, sql_types.SmallInteger,
             sql_types.Float, sql_types.Binary)


class WhooshAlchemyError(Exception):
    """ Base exception class for Flask-WhooshAlchemy3 """


class QueryProxy(flask_sqlalchemy.BaseQuery):
class WhooshConstants():
    index_dir = configuration.get('whoosh_index_dir')
    tokenized_analyzer = StandardAnalyzer(stoplist=None)
    normalized_analyzer = IDTokenizer() | SubstitutionFilter(r"[\s/,_'-]", "") | LowercaseFilter()
    stem_analyzer = StemmingAnalyzer(r"[\s/,_'-]", gaps=True, stoplist=None)
import os
import time
import shutil
import sqlite3

from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, NGRAM
from whoosh.analysis import StemmingAnalyzer, NgramWordAnalyzer
from whoosh.qparser import QueryParser
from whoosh.index import create_in, open_dir

os.chdir(os.path.dirname(__file__))
ix_dir = os.path.join(os.getcwd(), 'dir_indices')

b_indexing = False
if b_indexing:
    if os.path.isdir(ix_dir):
        shutil.rmtree(ix_dir)
    os.mkdir(ix_dir)

    schema = Schema(code=ID(stored=True),
                    name=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                    note=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                    iid=ID(stored=True))
    ix = create_in(ix_dir, schema)
    writer = ix.writer()

    cn = sqlite3.connect('mycis.db')
    cr = cn.cursor()
    print('creating indices ...')
    start_time = time.time()
    for r in cr.execute('select * from diag').fetchall():
        iid, code, name, name_zh = r
        # remove '.' in ICD-10 codes
        writer.add_document(code=code.replace('.', ''),
def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, type=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, type=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start', stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True, analyzer=StemmingAnalyzer(),
                field_boost=field_class.boost)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. "
            "Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
from whoosh import analysis, fields
from whoosh.analysis import StemmingAnalyzer, CharsetFilter
from whoosh.fields import TEXT, NUMERIC, STORED
from whoosh.support.charset import accent_map

BACKGROUND_JOB_KEY = "updateIndex"
UPDATE_INDEX_COMMAND = "update_index.py"
INDEX_PREFIX = "bookmarks-"
INDEXING_SETTING = "indexing"
CURRENT_INDEX_SETTING = "currentIndex"
INDEX_FRESH_CACHE = "freshIndex"

_N_GRAM_FIELD = "contentNGram"
_TEXT_FIELD = "contentText"
_CHILDREN_KEY = "children"
_BLUE_INDEX = "blue"
_GREEN_INDEX = "green"

_TEXT_ANALYZER = StemmingAnalyzer() | CharsetFilter(accent_map)
_N_GRAM_ANALYZER = analysis.NgramWordAnalyzer(minsize=2, maxsize=2)


class BookmarkSchema(fields.SchemaClass):
    contentNGram = TEXT(stored=False, analyzer=_N_GRAM_ANALYZER, phrase=False)
    contentText = TEXT(stored=False, analyzer=_TEXT_ANALYZER, phrase=True)
    urlSize = NUMERIC(signed=False, sortable=True, default=999)
    name = STORED()
    path = STORED()
    profile = STORED()
    url = STORED()
    icon = STORED()


class BookmarkIndex:
def BuildHelpIndex():
    if os.path.exists(indexDir):
        shutil.rmtree(indexDir, ignore_errors=True)
    os.mkdir(indexDir)

    stemmingAnalyzer = StemmingAnalyzer()
    schema = Schema(path=ID(stored=True, unique=True),
                    section=TEXT(stored=True),
                    title=TEXT(stored=True, analyzer=stemmingAnalyzer),
                    level=NUMERIC(stored=True),
                    content=TEXT(stored=True, analyzer=stemmingAnalyzer))
    ix = create_in(indexDir, schema)
    writer = ix.writer()

    titleTags = set([u'h1', u'h2', u'h3', u'h4', u'h5'])
    newLines = re.compile('\n+')
    nonNumeric = re.compile(r'[^\d]')

    def addDocument(fname, section, lastTitle, textCur):
        # print u'addDocument: lastTitle={}'.format(lastTitle)
        if lastTitle and textCur:
            section = '|'.join(section) if section else lastTitle.get_text()
            # print u'Indexing: {}: {}'.format(os.path.basename(fname), section)
            content = newLines.sub(u'\n', u'\n'.join(textCur))
            writer.add_document(path=os.path.basename(fname) + u'#' + lastTitle['id'],
                                title=lastTitle.get_text(),
                                section=section,
                                level=int(nonNumeric.sub(u'', lastTitle.name)),
                                content=content)

    # Extract content sections from the html pages.
    for f in glob.iglob(os.path.join(htmlDocDir, '*.html')):
        doc = BeautifulSoup(open(f).read(), 'html.parser')
        div = doc.find('div', class_='content')
        if not div:
            continue

        lastTitle = None
        textCur = []
        section = []
        for child in div.contents:
            try:
                tag = child.name
            except:
                tag = None

            if tag not in titleTags:
                try:
                    textCur.append(child.get_text())
                except:
                    pass
                continue

            addDocument(f, section, lastTitle, textCur)

            iSection = int(int(nonNumeric.sub('', tag))) - 1
            section = section[:iSection]
            section.append(child.get_text())

            lastTitle = child
            textCur = []

        addDocument(f, section, lastTitle, textCur)

    writer.commit()
# Copyright (C) 2013, Thomas Leonard
# See the COPYING file for details, or visit http://0install.net.
#
# This version for 0mirror is based on original code for 0install:
# http://thread.gmane.org/gmane.comp.file-systems.zero-install.devel/3847

import os
import logging

from whoosh.index import create_in, open_dir
from whoosh import fields
from whoosh.analysis import StemmingAnalyzer

from zeroinstall.injector.namespaces import XMLNS_IFACE

sa = StemmingAnalyzer()

schema = fields.Schema(uri=fields.ID(unique=True, stored=True),
                       baseuri=fields.KEYWORD(field_boost=10.0, lowercase=True),
                       name=fields.KEYWORD(stored=True, field_boost=50.0, lowercase=True),
                       summary=fields.TEXT(stored=True, field_boost=5.0),
                       description=fields.TEXT(analyzer=sa),
                       category=fields.KEYWORD(stored=True),
                       homepage=fields.STORED)


class Indexer:
    def __init__(self, config, index_dir):
        self.config = config
def _process_results(self, raw_page, highlight=False, query_string='',
                     spelling_query=None, result_class=None):
    from haystack import connections
    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    facets = {}
    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        additional_fields = {}
        model = haystack_get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = unified_index.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if index.fields[string_key].is_multivalued:
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del (additional_fields[DJANGO_CT])
            del (additional_fields[DJANGO_ID])

            if highlight:
                sa = StemmingAnalyzer()
                formatter = WhooshHtmlFormatter('em')
                terms = [token.text for token in sa(query_string)]

                whoosh_result = whoosh_highlight(
                    additional_fields.get(self.content_field_name),
                    terms, sa, ContextFragmenter(), formatter)
                additional_fields['highlighted'] = {
                    self.content_field_name: [whoosh_result],
                }

            result = result_class(app_label, model_name, raw_result[DJANGO_ID],
                                  score, **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if self.include_spelling:
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
class ObjectD(db.Model, BlogishBlob):
    __tablename__ = 'objectD'
    __searchable__ = ['title']
    __analyzer__ = StemmingAnalyzer() | DoubleMetaphoneFilter()
class Direction(db.Model):
    __searchable__ = ['name']
    __analyzer__ = StemmingAnalyzer()

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(1000), nullable=False)
import os
import sys
import csv

from whoosh.index import create_in
from whoosh.fields import *
from whoosh.analysis import SimpleAnalyzer, StandardAnalyzer, StemmingAnalyzer, FancyAnalyzer

if len(sys.argv) != 3:
    sys.exit(
        '\nInputError: the user must enter an analyzer and the csv file path to index.\n'
        'EX: "SimpleAnalyzer ./part_1/Cranfield_DATASET/docs_table.csv"\n\n'
        'The user can choose from the following analyzer methods:\n\n'
        '"SimpleAnalyzer": a lower-case filter\n\n'
        '"StandardAnalyzer": a lower-case filter and a stop-word filter\n\n'
        '"StemmingAnalyzer": a lower-case filter, a stop-word filter and a stemming filter\n\n'
        '"FancyAnalyzer": a lower-case filter, stop-word filter and stemming filter that also '
        'splits words into subwords when useful\n')

with open(sys.argv[2], 'r') as csv_file:
    reader = csv.reader(csv_file, delimiter=' ')
    schema_fields = next(reader)[0].split(',')

if sys.argv[1] == 'SimpleAnalyzer':
    analyzer = SimpleAnalyzer()
elif sys.argv[1] == 'StandardAnalyzer':
    analyzer = StandardAnalyzer()
elif sys.argv[1] == 'StemmingAnalyzer':
    analyzer = StemmingAnalyzer()
elif sys.argv[1] == 'FancyAnalyzer':
    analyzer = FancyAnalyzer()

schema = Schema(id=ID(stored=True))
for field in schema_fields[1:]:
    schema.add(field, TEXT(stored=False, analyzer=analyzer))

index_dir = os.path.dirname(sys.argv[2]) + '/' + sys.argv[1]
os.mkdir(index_dir)
create_in(index_dir, schema)