Example #1
def schema():

    schema = Schema(person=ID(stored=True),
        debate_no=TEXT(stored=True),
        sentiment_score=NUMERIC(stored=True, sortable=True),
        tags=KEYWORD(stored=True),
        sentence=TEXT(spelling=True, analyzer=StemmingAnalyzer(), stored=True))

    FIELD_KEYWORDS = 'keywords'
    FIELD_CONTENT = 'sentences'

    if not os.path.exists("index"):
        os.mkdir("index")
    ix = create_in("index", schema)



# create list of lists
    data = []
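    # 'datareader' is assumed to be a csv.reader over the debate-transcript CSV,
    # created earlier in the original script (not shown in this snippet)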
    for row in datareader:
        data.append(row)

# delete header
    del data[0]

# create list of dictionaries (using header terms as keys)
    transcript = []
    for row in data:
        dct = {}
        dct['party'] = row[0]
        dct['debateNo'] = row[1].decode('utf-8')
        dct['sentenceNo']=row[2]
        dct['sequenceNo']=row[3]
        dct['speaker']=row[4].decode('utf-8')
        dct['text']=row[5]
        transcript.append(dct)

# fix error in transcript for second Republican debate (WALKER's lines had been assigned to TRUMP or BUSH)
    for row in transcript:
        if row['party'] == 'rep' and row['debateNo']=='02' and row['text'].startswith('WALKER'):
            row['speaker'] = u'WALKER'
            text = bytearray(row['text'])
            del text[0:7]
            row['text'] = str(text)
        #print row

#for row in transcript:
    #print row

# encode sentences as unicode
    for row in transcript:
        row['text'] = row['text'].decode('utf-8')

    rep_speakers = ['CRUZ', 'RUBIO', 'KASICH', 'CARSON', 'FIORINA', 'PAUL', 'HUCKABEE', 'WALKER','TRUMP', 'CHRISTIE', 'BUSH']
    dem_speakers = ['CLINTON', 'SANDERS', 'CHAFEE', "O'MALLEY", 'WEBB']

# filtering out moderators
    transcript_no_moderators = []
    for row in transcript:
        if row['speaker'] in rep_speakers:
            transcript_no_moderators.append(row)
        if row['speaker'] in dem_speakers:
            transcript_no_moderators.append(row)

# Opening the index back up
    ix = open_dir("index")

# creating the testbatch
    testbatch=[]
    for row in transcript_no_moderators:
        testbatch.append(row)

    writer = ix.writer()
    for row in testbatch:
        writer.add_document(person=row['speaker'], debate_no =row['debateNo'], sentence=row['text'])
    writer.commit()
# sentiment score is already in the schema, so calculate the sentiment score in this for loop and spit it back out
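# A minimal sketch of that TODO: the indexing loop above could be rewritten as follows,
# using TextBlob purely as an illustrative sentiment library (an assumption, not part of the original code):
    from textblob import TextBlob  # assumed dependency

    writer = ix.writer()
    for row in testbatch:
        # polarity is a float in [-1.0, 1.0]; scale it to an int because the schema's
        # sentiment_score field uses the default int NUMERIC type
        score = int(round(TextBlob(row['text']).sentiment.polarity * 100))
        writer.add_document(person=row['speaker'],
                            debate_no=row['debateNo'],
                            sentiment_score=score,
                            sentence=row['text'])
    writer.commit()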
Example #2
whoosh.fields.NGRAM
TBD.
Expert users can create their own field types.
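
As a quick illustration (added here, not part of the original text), the NGRAM field mentioned above is declared like any other field type; minsize/maxsize control the gram lengths:

from whoosh.fields import Schema, ID, NGRAM

# index 2- to 4-character grams of the title, useful for substring-style matching
ngram_schema = Schema(doc_id=ID(stored=True),
                      title=NGRAM(minsize=2, maxsize=4, stored=True))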

Creating a Schema



from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer

schema = Schema(from_addr=ID(stored=True),
                to_addr=ID(stored=True),
                subject=TEXT(stored=True),
                body=TEXT(analyzer=StemmingAnalyzer()),
                tags=KEYWORD)

If you are not passing any constructor keyword arguments for a field, you can omit the trailing parentheses (e.g. fieldname=TEXT instead of fieldname=TEXT()); Whoosh will instantiate the field type for you.

You can also create a schema declaratively by subclassing SchemaClass:
from whoosh.fields import SchemaClass, TEXT, KEYWORD, ID, STORED

class MySchema(SchemaClass):
        path = ID(stored=True)
        title = TEXT(stored=True)
        content = TEXT
        tags = KEYWORD
You can pass the class itself (rather than an instance of it) to the create_in() or create_index() function.
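
For example (a minimal sketch; "indexdir" is just a placeholder directory name):

import os
from whoosh.index import create_in

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = create_in("indexdir", MySchema)  # pass the class itself; Whoosh instantiates it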
Example #3
class Ingredient(db.Model):
    __searchable__ = ['name']
    __analyzer__ = StemmingAnalyzer()
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(250), nullable=False)
Example #4
class BmarkSchema(SchemaClass):
    bid = ID(unique=True, stored=True)
    description = TEXT
    extended = TEXT
    tags = KEYWORD
    readable = TEXT(analyzer=StemmingAnalyzer())
Example #5
try:
    from jieba.analyse import ChineseAnalyzer
except Exception as err:
    print(repr(err))
    ChineseAnalyzer = None

SITE_CFG['LANG'] = SITE_CFG.get('LANG', 'zh')

# Using jieba lib for Chinese.
if SITE_CFG['LANG'] == 'zh' and ChineseAnalyzer:
    TOR_SCHEMA = Schema(title=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                        catid=TEXT(stored=True),
                        type=TEXT(stored=True),
                        link=ID(unique=True, stored=True),
                        content=TEXT(stored=True, analyzer=ChineseAnalyzer()))
else:
    TOR_SCHEMA = Schema(title=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                        catid=TEXT(stored=True),
                        type=TEXT(stored=True),
                        link=ID(unique=True, stored=True),
                        content=TEXT(stored=True, analyzer=StemmingAnalyzer()))

WHOOSH_BASE = 'database/whoosh'
if os.path.exists(WHOOSH_BASE):
    TOR_IDX = open_dir(WHOOSH_BASE)
else:
    os.makedirs(WHOOSH_BASE)
    TOR_IDX = create_in(WHOOSH_BASE, TOR_SCHEMA)


def do_for_app(rand=True, kind='', doc_type=None):
    '''
Example #6
    def get_absolute_url(self):
        return reverse('recommendations') + '?seed=%s' % self.paper_id

    def get_title(self):
        return self.title

    def set_rank(self, rank):
        self.rank = rank
        return self


paper_schema = Schema(
    paper_id=ID(stored=True),
    title=TEXT(stored=True),
    abstract=TEXT(analyzer=StemmingAnalyzer()),
    paper_url=TEXT(),
    aspect_tasks=KEYWORD,
    aspect_methods=KEYWORD,
    aspect_datasets=KEYWORD,
)

if settings.ASPECT_KNN_WHOOSH_INDEX_PATH and os.path.exists(settings.ASPECT_KNN_WHOOSH_INDEX_PATH):
    ix = index.open_dir(settings.ASPECT_KNN_WHOOSH_INDEX_PATH)  #'/Users/maos01/Desktop/special-docembeds-release-files/output/pwc/whoosh_index'
else:
    ix = None


# Load vector models
generic_vecs = KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_GENERIC_W2V_PATH, limit=settings.ASPECT_KNN_LIMIT) if settings.ASPECT_KNN_GENERIC_W2V_PATH and os.path.exists(settings.ASPECT_KNN_GENERIC_W2V_PATH) else None #  '/Users/maos01/Downloads/specter.1k.w2v.txt'
task_vecs = KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_TASK_W2V_PATH, limit=settings.ASPECT_KNN_LIMIT) if settings.ASPECT_KNN_TASK_W2V_PATH and os.path.exists(settings.ASPECT_KNN_TASK_W2V_PATH) else None
Example #7
def main():
    file_content_doc1 = open("rural_min.txt").read()
    file_content_doc2 = open("science_min.txt").read()
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")
        if option == "1":

            sent_tokenize_list1 = sent_tokenize(file_content_doc1,
                                                language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2,
                                                language='english')
            if not os.path.exists("index_task3_min"):
                os.mkdir("index_task3_min")

            my_analyzer = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | Lemmatizer()
            pos_tagger = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | PosTagger()
            wordnetsyn1 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets()
            wordnetsyn2 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets1()
            wordnetsyn3 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets2()
            wordnetsyn4 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets3()

            schema = Schema(id=ID(stored=True, unique=True),
                            standard=TEXT(stored=True,
                                          analyzer=StandardAnalyzer()),
                            stem_text=TEXT(stored=True,
                                           analyzer=StemmingAnalyzer()),
                            lemma=TEXT(stored=True, analyzer=my_analyzer),
                            pos_text=TEXT(stored=True, analyzer=pos_tagger),
                            hypernym=TEXT(stored=True, analyzer=wordnetsyn1),
                            hyponym=TEXT(stored=True, analyzer=wordnetsyn2),
                            holonym=TEXT(stored=True, analyzer=wordnetsyn3),
                            meronyms=TEXT(stored=True, analyzer=wordnetsyn4),
                            dependency=TEXT(analyzer=DependencyParser()))

            ix = index.create_in("index_task3_min", schema)
            writer = ix.writer()

            for sentence in sent_tokenize_list1:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            writer.commit()

            print_index_details(ix)

            print("\n\n Index created with various features as its fields")

        elif option == "2":
            ix = index.open_dir("index_task3_min")

            with ix.searcher(weighting=whoosh.scoring.BM25F()) as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query_text = MultifieldParser([
                    "standard", "stem_text", "lemma", "pos_text", "hyponym",
                    "meronyms", "hypernym", "holonym"
                ],
                                              schema=ix.schema,
                                              group=og).parse(q)
                results = searcher.search(query_text, limit=10)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["standard"], sep=":")
                    print("\n")

        elif option == "3":
            print("\n Goodbye")
            sys.exit(0)
            option = None
        else:
            print("\n Not valid choice try again...!")
Example #8
import os.path
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.analysis import StemmingAnalyzer
from whoosh.lang.snowball import english
from whoosh.lang.porter2 import stem
# from whoosh.lang.paicehusk import PaiceHuskStemmer

# Use PorterStemmer2 Algorithm in indexing
stem_ana = StemmingAnalyzer(stemfn=stem)

schema = Schema(docID=NUMERIC(stored=True), contents=TEXT(analyzer=stem_ana))
index_dir = "index"

if not os.path.exists(index_dir):
    os.makedirs(index_dir)

ix = create_in(index_dir, schema)

writer = ix.writer()

with open('doc/document.txt', 'r') as f:
    text = f.read()
    docs = text.split('   /\n')[:-1]
    for doc in docs:
        br = doc.find('\n')
        docID = int(doc[:br])
        doc_text = doc[br + 1:]
        writer.add_document(docID=docID, contents=doc_text)

writer.commit()
Example #9
class Recipe(db.Model):
    __searchable__ = ['title', 'description', 'calories']
    __analyzer__ = StemmingAnalyzer()
    id = db.Column(db.Integer, primary_key=True)
    layout = db.Column(db.String(15))
    title = db.Column(db.String(100), unique=True, nullable=False)
    title_formatted = db.Column(db.String(100))
    filename = db.Column(db.String(100))
    image_path = db.Column(db.String(104))
    image_credit = db.Column(db.String(150))
    source = db.Column(db.String(150))
    description = db.Column(db.String(750))
    prep = db.Column(db.String(10))
    cook = db.Column(db.String(10))
    ready = db.Column(db.String(10))
    servings = db.Column(db.String(5))
    calories = db.Column(db.String(20))
    file_last_modified = db.Column(db.DateTime)

    tags = db.relationship('Tag',
                           secondary=recipe_tag,
                           lazy=True,
                           backref=db.backref('recipe', lazy=True))
    ingredients = db.relationship('Ingredient',
                                  secondary=recipe_ingredient,
                                  lazy=True,
                                  backref=db.backref('recipe', lazy=True))
    directions = db.relationship('Direction',
                                 secondary=recipe_direction,
                                 lazy=True,
                                 backref=db.backref('recipe', lazy=True))
    notes = db.relationship('Note',
                            secondary=recipe_note,
                            lazy=True,
                            backref=db.backref('recipe', lazy=True))

    def api_model(self):
        tags = []
        for tag in self.tags:
            tags.append(tag.name)

        ingredients = []
        for ingredient in self.ingredients:
            ingredients.append(ingredient.name)

        directions = []
        for direction in self.directions:
            directions.append(direction.name)

        notes = []
        for note in self.notes:
            notes.append(note.name)

        model = {
            'id': self.id,
            'layout': self.layout,
            'title': self.title,
            'title_formatted': self.title_formatted,
            'filename': self.filename,
            'image_path': self.image_path,
            'image_credit': self.image_credit,
            'source': self.source,
            'description': self.description,
            'prep': self.prep,
            'cook': self.cook,
            'ready': self.ready,
            'servings': self.servings,
            'calories': self.calories,
            'file_last_modified': self.file_last_modified,
            'tags': tags,
            'directions': directions,
            'ingredients': ingredients,
            'notes': notes
        }
        return model

    def __repr__(self):
        return f'<Recipe: {self.title}>'
Example #10
def FragmenterAnalyzer():
    ret = StemmingAnalyzer(minsize=0, stoplist=None)
    return ret
Example #11
from whoosh.writing import AsyncWriter

parser = argparse.ArgumentParser()
parser.add_argument("data", type=str)
parser.add_argument("-num_docs", type=int, default=None)
parser.add_argument("-threads", type=int,
                    default=1)  # seems using more than 1 thread may be broken?
parser.add_argument("-reload", action="store_true")
parser.add_argument("-migrate_url_to_text_field", action="store_true")
args = parser.parse_args()

schema = Schema(
    docid=ID(stored=True),
    url=ID(stored=True),
    title=TEXT(stored=True,
               analyzer=StemmingAnalyzer()),  # maybe no stemming here?
    body=TEXT(analyzer=StemmingAnalyzer()),
)

index_dir = "data/msmarcoidx" if args.num_docs is None else "data/quickidx"
if not os.path.exists(index_dir):
    os.mkdir(index_dir)
    index.create_in(index_dir, schema)
    args.reload = True

storage = FileStorage(index_dir)
# Open an existing index
ix = storage.open_index()

if args.migrate_url_to_text_field:
    writer = ix.writer()
Example #12
def search_engine( analyzer = StemmingAnalyzer(), max_res = 150, multifield_flag = 1, \
                  only_title_flag = 0, \
                  directory_containing_the_index  = r"C:\Users\claba\Desktop\DMT works\HW_1\Index_part_1", \
                  query_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv", \
                  gt_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Ground_Truth.tsv", \
                  doc_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\DOCUMENTS\\", \
                  conf_label = "Not Specified",
                  mrr_eps = .32, \
                  k_interval_for_nDCG = range(1,151)):
   
    
    ###
    ### Create a Schema 
    ###
    schema = Schema(id=ID(stored=True), \
                    title = TEXT(stored=False, analyzer=analyzer),content=TEXT(stored=False, analyzer=analyzer))
    
    ###
    ### Create an empty-Index 
    ### according to the just defined Schema ;)
    ### 
    ix = create_in(directory_containing_the_index, schema)
    
    
    ###
    ### Get the query set (reset index due to missing values in the IDs)
    ###
    query_set = pd.read_csv(query_dir, engine = "python", sep = "\t", index_col="Query_ID").reset_index()
    
    
    ###
    ### Get the ground truth (a little manipulation to group by query and align IDs)
    ###
    gt_tmp = pd.read_csv(gt_dir, engine = "python", sep = "\t")
    gt_tmp = gt_tmp.groupby('Query_id')['Relevant_Doc_id'].apply(lambda x: x.tolist()).to_dict()
    gt = defaultdict(list)
    j = 1
    for i in range(len(gt_tmp)):
        while(gt[i] == []):
            try:
                gt[i] = gt_tmp[j]
                j+=1
            except KeyError:
                j += 1
    
    
    
    number_of_queries = len(query_set)
    num_of_docs = 1400
    
    ###
    ### We'll iterate over the following lists to switch the SE scoring function and get its name
    ###
    scoring_functions_list = [scoring.PL2(), scoring.Frequency(), scoring.BM25F(), scoring.TF_IDF()]
    scoring_name = [re.findall(r"(?<=scoring\.)[\w\W]*(?=object)", str(score))[0] for score in scoring_functions_list]
    
    
    ###
    ### Fill the Index
    ###
    writer = ix.writer()
    for doc in range(num_of_docs):
        id_ = str(doc+1)
        title,content = doc_retriver(doc_dir+"______"+str(doc+1)+".html")
        writer.add_document(id=id_, title = title, content = content)
    writer.commit()
    
    
    
    ###
    ### This """tensor""" allows to store all the results we need. It's dimension are #ResultsX#QueriesX#SE_config
    ###
    results_mat = np.zeros([max_res,number_of_queries,len(scoring_functions_list)])
    
   
    evaluations_summary = {} # dict to store MRR and R-Precision distro summaries
    ndcg = defaultdict(list) # defaultdict holding nDCG values for varying k, for all SEs with MRR > .32

    ###
    ### Run the SEs
    ###
    for idx_s,scorer in enumerate(scoring_functions_list):
        for idx,query in enumerate(query_set["Query"]):
            
            input_query = query
            
            ###
            ### Select a Scoring-Function
            ###
            scoring_function = scorer
            
            ###
            ### Create a QueryParser for 
            ### parsing the input_query based on user SE choosen configuration.
            ###
            if multifield_flag:
                qp = MultifieldParser(["title","content"], ix.schema)
                parsed_query = qp.parse(input_query)# parsing the query
            else:
                if only_title_flag:
                    qp = SimpleParser("title", ix.schema)
                    parsed_query = qp.parse(input_query)# parsing the query
                else:
                    qp = SimpleParser("content", ix.schema)
                    parsed_query = qp.parse(input_query)# parsing the query
                
            ###
            ### Create a Searcher for the Index
            ### with the selected Scoring-Function 
            ###
            searcher = ix.searcher(weighting=scoring_function)
            
            ###
            ### Perform a Search and store results
            ###
            results = searcher.search(parsed_query, limit=max_res)
            results_mat[0:len(results),idx,idx_s] = [hit["id"] for hit in results]
            searcher.close()
        mrr_res = mrr(results_mat[:,:,idx_s],gt)
        
        if mrr_res >= mrr_eps:
            
            ###
            ### Compute and summarize R-precision distro
            ###
            r_res = r_precision(results_mat[:,:,idx_s],gt)
            mean = np.mean(list(r_res.values()))
            first_q = np.percentile(list(r_res.values()),25)
            third_q = np.percentile(list(r_res.values()),75)
            median = np.median(list(r_res.values()))
            minr = min(list(r_res.values()))
            maxr = max(list(r_res.values()))
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res,mean,minr,first_q,median,third_q,maxr]
            
            ###
            ### Compute nDCG@k for varying k and for each scoring function
            ###
            for k in k_interval_for_nDCG:
                tmp_res = np.mean(list(nDCG(results_mat[:,:,idx_s],gt,k = k).values()))
                ndcg[conf_label+","+scoring_name[idx_s]].append(tmp_res)
            
        else:
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res]
        
        ###
        ### Just to see what's happening
        ###
        print("Configuration:"+conf_label+","+scoring_name[idx_s]+"==> MRR = "+str(mrr_res))
        
    return evaluations_summary, ndcg # the evaluation summary contains only MRR for SEs with MRR < .32
Example #13
# app.config['SECRET_KEY']='f6eeaa4486447025a35ab182035a34a0'
# app.config['SQLAlCHEMY_DATABASE_URI']='sqlite:///site.db'
# db = SQLAlchemy(app)

# from ed_main import routes

# converting your app into a package structure.
from flask import Flask
from flask_wtf.csrf import CSRFProtect, CSRFError
from flask_sqlalchemy import SQLAlchemy
from flask_bcrypt import Bcrypt
from whoosh.analysis import StemmingAnalyzer
import flask_whooshalchemy
from flask_admin import Admin

app = Flask(__name__)
csrf = CSRFProtect(app)
app.config['SECRET_KEY'] = 'f6eeaa4486447025a35ab182035a34a0'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///site.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = True
app.config['WHOOSH_BASE'] = 'whoosh'
app.config['WHOOSH_ANALYZER'] = StemmingAnalyzer()
app.config['FLASK_ADMIN_SWATCH'] = 'cerulean'

db = SQLAlchemy(app)
bcrypt = Bcrypt(app)
admin = Admin(app, name='Project Ed', template_mode='bootstrap3')

from ed_main import routes
from ed_main import admin_views
Example #14
def create_schema():
    schema = Schema(doc_text=TEXT(analyzer=StemmingAnalyzer(), stored=True))
    return schema
Example #15
import os
import os.path
import datetime
from whoosh import index
from whoosh import query
from whoosh.fields import Schema, TEXT, ID, STORED, DATETIME, KEYWORD, NUMERIC
from whoosh.analysis import StemmingAnalyzer
from get_law_fields import list_from_file, get_fields
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.query import Query, Term, And

# import stopwords
with open("scripts/search_static/stopwords.txt", 'r') as f:
    stopwords = sorted(list(f.read().split('\n')))

lang_ana = StemmingAnalyzer(stoplist=stopwords)

# CREATE A SCHEMA
"""
The schema defines the fields that each document 
(i.e. law in most cases) may contain. 

law_name -- name of the document. Searchable and stored.
law_body -- the intro and articles of a law. Searchable only.
law_num_date -- the number of the law and the exact date. Searchable and stored.
pub_year -- the date of the Official Gazette publication.
article_one -- title and first few sentences of article one. Stored only for displaying in search results.

"""

schema = Schema(
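# The original schema definition is truncated at this point. Based on the field
# descriptions in the docstring above, a completion along these lines would fit
# (the exact field types are my assumptions, not the original's):
schema = Schema(
    law_name=TEXT(stored=True, analyzer=lang_ana),
    law_body=TEXT(analyzer=lang_ana),
    law_num_date=TEXT(stored=True),
    pub_year=DATETIME(stored=True),
    article_one=STORED,
)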
Example #16
"""
Created on Fri Dec  1 14:14:36 2017

@author: francruz

"""
import pandas as pd
import whoosh
import csv

# Creating the schema
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
schema = Schema(asin=KEYWORD(stored=True, scorable=True, sortable=True),
                helpful=STORED,
                reviewText=TEXT(analyzer=StemmingAnalyzer(),
                                phrase=False,
                                stored=True),
                overall=TEXT(analyzer=StemmingAnalyzer(), phrase=False),
                reviewTime=ID(stored=True),
                title=TEXT(analyzer=StemmingAnalyzer(),
                           phrase=False,
                           stored=True),
                price=STORED,
                brand=KEYWORD(stored=True),
                reviewLength=STORED,
                reviewWords=STORED,
                avgWordLength=STORED,
                expresiveness=STORED,
                ratingDelta=STORED,
                priceDelta=STORED)
Example #17
          imageURL text,
          price numeric,
          rating numeric,
          noOfReviews numeric,
          savings numeric,
          percentageSavings numeric,
          productDesc text,
          reviewPolarity numeric,
          countryOfOrigin text,
          overview text)''')
c.close()

# initialise sentic net
sn = SenticNet()
# does stemming, removes accents so you can match words like cafe, facade etc and removes stopwords
hsn_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | StopFilter()

SCHEMA = Schema(
    filename=ID(unique=True, stored=True, analyzer=hsn_analyzer),
    content=TEXT(analyzer=hsn_analyzer, spelling=True),
    price=NUMERIC(sortable=True, stored=True),
    rating=NUMERIC(sortable=True, stored=True),
    noOfReviews=NUMERIC(sortable=True, stored=True),
    savings=NUMERIC(sortable=True, stored=True),
    percentageSavings=NUMERIC(sortable=True, stored=True),
    review=TEXT(analyzer=hsn_analyzer, spelling=True),
    productDesc=TEXT(stored=True),
    reviewPolarity=NUMERIC(sortable=True, stored=True),
    countryOfOrigin=TEXT(sortable=True, stored=True),
    overview=TEXT(stored=True),
)
Example #18
from whoosh.index import create_in
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StemmingAnalyzer
from bs4 import BeautifulSoup
from whoosh import qparser
from whoosh.qparser import QueryParser
import re,os,codecs,sys

#Function that removes all the lines with tags style and script and document and head and title and also comments
def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->',element.encode('utf-8')):
        return False
    return True

schema = Schema(id=ID(stored=True),
                img=TEXT(stored=True),
                title=TEXT(stored=True),
                h1=TEXT(analyzer=StemmingAnalyzer()),
                content=TEXT(analyzer=StemmingAnalyzer(), stored=True))
dir = os.listdir('dataset')
ix = create_in("index", schema)
for i, l in enumerate(dir):
	p = "dataset/"+l
	html = codecs.open( p , "r", "utf-8" ).read()
	soup = BeautifulSoup(html, 'html.parser')
	tit = u''
	tit += soup.title.string
	imgs = soup.find('h1').find_all_next('img')[0]
	im = u'https:'
	im += imgs["src"]
	texts = soup.findAll(text=True)
	visible_texts = filter(visible, texts)
	div = soup.find("div", {"id": "content"})
	headers = div.find_all(['h1', 'h2', 'h3'])
Example #19
import sys
import sqlalchemy
from inspect import isclass
from flask_sqlalchemy import models_committed
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.inspection import inspect
from sqlalchemy.types import Boolean, Date, DateTime, Float, Integer, Text
from whoosh import index as whoosh_index
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import BOOLEAN, DATETIME, ID, NUMERIC, TEXT
from whoosh.fields import Schema as _Schema
from whoosh.qparser import AndGroup, MultifieldParser, OrGroup
from .backends import BaseBackend, logger, relation_column

DEFAULT_WHOOSH_INDEX_NAME = 'msearch'
DEFAULT_ANALYZER = StemmingAnalyzer()
DEFAULT_PRIMARY_KEY = 'id'

if sys.version_info[0] < 3:
    str = unicode


class Schema(object):
    def __init__(self, table, analyzer=None):
        self.table = table
        self.analyzer = analyzer
        self.schema = _Schema(**self.fields)

    @property
    def fields(self):
        model = self.table
Example #20
import logging
import os
import whoosh
import sqlalchemy
import flask_sqlalchemy
import whoosh.index
from whoosh import fields as whoosh_fields
from whoosh.analysis import StemmingAnalyzer
from whoosh.qparser import OrGroup, AndGroup, MultifieldParser
from whoosh.filedb.filestore import RamStorage
from whoosh.writing import AsyncWriter
from sqlalchemy import types as sql_types
from sqlalchemy.orm import EXT_CONTINUE

logger = logging.getLogger(__name__)

# DEFAULTS
DEFAULT_WHOOSH_ANALYZER = StemmingAnalyzer()
DEFAULT_WHOOSH_INDEX_PATH = os.path.join(os.path.abspath(os.getcwd()),
                                         '.indexes')

UPDATE_FIELDS = ('update', 'insert')
TEXT_TYPES = (sql_types.String, sql_types.Unicode, sql_types.Text)
DATE_TYPES = (sql_types.DateTime, sql_types.Date)
NUM_TYPES = (sql_types.Integer, sql_types.BigInteger, sql_types.SmallInteger,
             sql_types.Float, sql_types.Binary)


class WhooshAlchemyError(Exception):
    """ Base exception class for Flask-WhooshAlchemy3 """


class QueryProxy(flask_sqlalchemy.BaseQuery):
Example #21
class WhooshConstants():
    index_dir = configuration.get('whoosh_index_dir')
    tokenized_analyzer = StandardAnalyzer(stoplist=None)
    normalized_analyzer = IDTokenizer() | SubstitutionFilter(
        r"[\s/,_'-]", "") | LowercaseFilter()
    stem_analyzer = StemmingAnalyzer(r"[\s/,_'-]", gaps=True, stoplist=None)
Example #22
import os
import shutil
import sqlite3
import time

from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, NGRAM
from whoosh.analysis import StemmingAnalyzer, NgramWordAnalyzer
from whoosh.qparser import QueryParser
from whoosh.index import create_in, open_dir

os.chdir(os.path.dirname(__file__))
ix_dir = os.path.join(os.getcwd(), 'dir_indices')

b_indexing = False
if b_indexing:
    if os.path.isdir(ix_dir):
        shutil.rmtree(ix_dir)
    os.mkdir(ix_dir)
    schema = Schema(code=ID(stored=True),
                    name=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                    note=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                    iid=ID(stored=True))

    ix = create_in(ix_dir, schema)
    writer = ix.writer()

    cn = sqlite3.connect('mycis.db')
    cr = cn.cursor()

    print('creating indices ...')
    start_time = time.time()
    for r in cr.execute('select * from diag').fetchall():
        iid, code, name, name_zh = r
        # remove . in icd10
        writer.add_document(code=code.replace('.', ''),
Example #23
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=StemmingAnalyzer(),
                    field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example #24
from whoosh import analysis, fields
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh.fields import NUMERIC, STORED, TEXT
from whoosh.support.charset import accent_map

BACKGROUND_JOB_KEY = "updateIndex"
UPDATE_INDEX_COMMAND = "update_index.py"
INDEX_PREFIX = "bookmarks-"
INDEXING_SETTING = "indexing"
CURRENT_INDEX_SETTING = "currentIndex"
INDEX_FRESH_CACHE = "freshIndex"
_N_GRAM_FIELD = "contentNGram"
_TEXT_FIELD = "contentText"
_CHILDREN_KEY = "children"

_BLUE_INDEX = "blue"
_GREEN_INDEX = "green"

_TEXT_ANALYZER = StemmingAnalyzer() | CharsetFilter(accent_map)
_N_GRAM_ANALYZER = analysis.NgramWordAnalyzer(minsize=2, maxsize=2)


class BookmarkSchema(fields.SchemaClass):
    contentNGram = TEXT(stored=False, analyzer=_N_GRAM_ANALYZER, phrase=False)
    contentText = TEXT(stored=False, analyzer=_TEXT_ANALYZER, phrase=True)
    urlSize = NUMERIC(signed=False, sortable=True, default=999)
    name = STORED()
    path = STORED()
    profile = STORED()
    url = STORED()
    icon = STORED()


class BookmarkIndex:
Example #25
def BuildHelpIndex():

    if os.path.exists(indexDir):
        shutil.rmtree(indexDir, ignore_errors=True)
    os.mkdir(indexDir)

    stemmingAnalyzer = StemmingAnalyzer()
    schema = Schema(path=ID(stored=True, unique=True),
                    section=TEXT(stored=True),
                    title=TEXT(stored=True, analyzer=stemmingAnalyzer),
                    level=NUMERIC(stored=True),
                    content=TEXT(stored=True, analyzer=stemmingAnalyzer))
    ix = create_in(indexDir, schema)
    writer = ix.writer()

    titleTags = set([u'h1', u'h2', u'h3', u'h4', u'h5'])

    newLines = re.compile('\n+')
    nonNumeric = re.compile(r'[^\d]')

    def addDocument(fname, section, lastTitle, textCur):
        # print u'addDocument: lastTitle={}'.format(lastTitle)
        if lastTitle and textCur:
            section = '|'.join(section) if section else lastTitle.get_text()
            # print u'Indexing: {}: {}'.format(os.path.basename(fname), section)
            content = newLines.sub(u'\n', u'\n'.join(textCur))
            writer.add_document(path=os.path.basename(fname) + u'#' +
                                lastTitle['id'],
                                title=lastTitle.get_text(),
                                section=section,
                                level=int(nonNumeric.sub(u'', lastTitle.name)),
                                content=content)

    # Extract content sections from the html pages.
    for f in glob.iglob(os.path.join(htmlDocDir, '*.html')):
        doc = BeautifulSoup(open(f).read(), 'html.parser')
        div = doc.find('div', class_='content')
        if not div:
            continue

        lastTitle = None
        textCur = []
        section = []
        for child in div.contents:
            try:
                tag = child.name
            except:
                tag = None

            if tag not in titleTags:
                try:
                    textCur.append(child.get_text())
                except:
                    pass
                continue

            addDocument(f, section, lastTitle, textCur)

            iSection = int(int(nonNumeric.sub('', tag))) - 1
            section = section[:iSection]
            section.append(child.get_text())

            lastTitle = child
            textCur = []

        addDocument(f, section, lastTitle, textCur)

    writer.commit()
Example #26
# Copyright (C) 2013, Thomas Leonard
# See the COPYING file for details, or visit http://0install.net.
#
# This version for 0mirror is based on original code for 0install:
# http://thread.gmane.org/gmane.comp.file-systems.zero-install.devel/3847

import os
import logging

from whoosh.index import create_in, open_dir
from whoosh import fields
from whoosh.analysis import StemmingAnalyzer

from zeroinstall.injector.namespaces import XMLNS_IFACE

sa = StemmingAnalyzer()
schema = fields.Schema(uri=fields.ID(unique=True, stored=True),
                       baseuri=fields.KEYWORD(field_boost=10.0,
                                              lowercase=True),
                       name=fields.KEYWORD(stored=True,
                                           field_boost=50.0,
                                           lowercase=True),
                       summary=fields.TEXT(stored=True, field_boost=5.0),
                       description=fields.TEXT(analyzer=sa),
                       category=fields.KEYWORD(stored=True),
                       homepage=fields.STORED)


class Indexer:
    def __init__(self, config, index_dir):
        self.config = config
Example #27
    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter('em')
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name), terms,
                        sa, ContextFragmenter(), formatter)
                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_result],
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
Example #28
class ObjectD(db.Model, BlogishBlob):
    __tablename__ = 'objectD'
    __searchable__ = ['title']
    __analyzer__ = StemmingAnalyzer() | DoubleMetaphoneFilter()
Example #29
class Direction(db.Model):
    __searchable__ = ['name']
    __analyzer__ = StemmingAnalyzer()
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(1000), nullable=False)
Example #30
import csv
import os
import sys

from whoosh.index import create_in
from whoosh.fields import *
from whoosh.analysis import SimpleAnalyzer, StandardAnalyzer, StemmingAnalyzer, FancyAnalyzer

if len(sys.argv) != 3:
    sys.exit(
        '\nInputError: the user must enter an analyzer and the csv file path to index.\n'
        'EX: "SimpleAnalyzer ./part_1/Cranfield_DATASET/docs_table.csv"\n\n'
        'The user can choose from the following analyzer methods:\n\n'
        '"SimpleAnalyzer": it is a lower case filter\n\n'
        '"StandardAnalyzer": it is a lower case filter and  stop-words filter\n\n'
        '"StemmingAnalyzer": it is a lower case filter, stop-words filter and stemming filter\n\n'
        '"FancyAnalyzer": it is a lower case, stop-words, stemming filter and split words into subwords when it'
        'is useful\n')

with open(sys.argv[2], 'r') as csv_file:
    reader = csv.reader(csv_file, delimiter=' ')
    schema_fields = next(reader)[0].split(',')

if sys.argv[1] == 'SimpleAnalyzer': analyzer = SimpleAnalyzer()
elif sys.argv[1] == 'StandardAnalyzer': analyzer = StandardAnalyzer()
elif sys.argv[1] == 'StemmingAnalyzer': analyzer = StemmingAnalyzer()
elif sys.argv[1] == 'FancyAnalyzer': analyzer = FancyAnalyzer()
else: sys.exit('\nInputError: "%s" is not one of the supported analyzers.\n' % sys.argv[1])

schema = Schema(id=ID(stored=True))
for field in schema_fields[1:]:
    schema.add(field, TEXT(stored=False, analyzer=analyzer))

index_dir = os.path.dirname(sys.argv[2]) + '/' + sys.argv[1]
os.mkdir(index_dir)
create_in(index_dir, schema)