Пример #1
0
    def __init__(self,
                 stored=False,
                 unique=False,
                 expression=None,
                 field_boost=1.0,
                 spelling=False):
        """
        Configure the analyzer, format and option flags for this field.

        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Whether the value of this field is unique per-document.
        :param expression: The regular expression object to use to extract
            tokens. When omitted (or falsy), a default expression is used
            that breaks tokens on CRs, LFs, tabs, spaces, commas, and
            semicolons.
        :param field_boost: Passed through to ``formats.Existence`` as its
            ``field_boost`` argument.
        :param spelling: Kept on the instance as ``self.spelling``.
        """

        # Fall back to the default tokenizing pattern when the caller did not
        # provide one.
        if not expression:
            expression = re.compile(r"[^\r\n\t ,;]+")

        self.analyzer = RegexAnalyzer(expression=expression)
        self.format = formats.Existence(field_boost=field_boost)

        # Plain option flags, kept verbatim on the instance.
        self.stored, self.unique, self.spelling = stored, unique, spelling
#!/usr/bin/env
#coding:utf-8
from whoosh.fields import *
from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
from whoosh.index import create_in
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import os.path
import whoosh
from whoosh.analysis import RegexAnalyzer
import csv

from whoosh import qparser
# Module-level analyzer shared by the schema below. The pattern matches either
# a single character in the range U+4E00-U+9FA5, or a run of word characters
# optionally joined by single dots.
analyzer = RegexAnalyzer(r'([\u4e00-\u9fa5])|(\w+(\.?\w+)*)')  # Chinese-language analyzer


def createIndexs(dirName):
    """Create a Whoosh index in ``dirName`` and read ``Test.csv`` into memory.

    The directory is created when it does not exist yet, then a new index is
    created in it with the schema defined below. ``Test.csv`` is expected to
    hold one ``<id>\\t<content>`` record per line; the records are collected
    into a dict keyed by id.

    :param dirName: Path of the directory that holds the index.
    :raises ValueError: If a line of ``Test.csv`` does not contain exactly
        one tab character (the two-name unpacking fails).

    NOTE(review): ``writer`` and ``reader`` are created at the end but are not
    used within the visible part of this function; presumably the code that
    fills the index follows -- confirm.
    """
    # NOTE(review): "semtiment" looks like a typo for "sentiment". It is kept
    # as-is because the field name is part of the stored index schema.
    schema = Schema(id=NUMERIC(sortable=True),
                    views=KEYWORD(stored=True),
                    semtiment=TEXT(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))

    if not os.path.exists(dirName):
        os.mkdir(dirName)
    ix = create_in(dirName, schema)

    dic = {}
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open('Test.csv') as test_file:
        for line in test_file:
            # The line is not stripped, so ``content`` keeps its trailing
            # newline.
            id, content = line.split('\t')
            dic[id] = content
    writer = ix.writer()
    # Deliberately left open: csv.reader pulls rows lazily from the file
    # object. NOTE(review): nothing in view closes this file.
    reader = csv.reader(open('result_bs.csv'))
Пример #3
0
        #
        num_added_records_so_far += 1
        if (num_added_records_so_far % 100 == 0):
            print(" num_added_records_so_far= " + str(num_added_records_so_far))
    #
    writer.commit()  # it is necessary to store the index once filled
    in_file.close()  # it is necessary to close the .csv file


'''
Here the "schemas" function is used to create and fill all the schemas (indexes) for both .csv files (Cranfield.csv and Time.csv).

'''

# All the analyzers that are used, instantiated once up front.
analyzers = [
    StemmingAnalyzer(),
    StandardAnalyzer(),
    RegexAnalyzer(),
    SimpleAnalyzer(),
    FancyAnalyzer(),
    NgramAnalyzer(4),
    KeywordAnalyzer(),
    LanguageAnalyzer('en'),
]
# Names of the analyzers, listed in the same order as the ``analyzers`` list.
analyzer_names = [
    'StemmingAnalyzer',
    'StandardAnalyzer',
    'RegexAnalyzer',
    'SimpleAnalyzer',
    'FancyAnalyzer',
    'NgramAnalyzer',
    'KeywordAnalyzer',
    'LanguageAnalyzer',
]

# Base names of the .csv files.
csv_names = [
    'Cranfield',
    'Time',
]



# start to iterate over all the .csv files (in particular the only two that there are, Cranfield.csv, and Time.csv)
for name in csv_names: 
    
    print(name, '\n\n')
    
    path = "C:./"+name+"_DATASET" # get the path where the .csv is stored
    for e,type_analyzer in enumerate(analyzers): # now the iteration is necessary to create the 8 different inverted indexes
Пример #4
0
import time
import hashlib
from urllib import unquote
from searcher.models import config
from slutils import mysql_new
import os
import datetime
from searcher.models import formatURL
from whoosh.analysis import RegexAnalyzer
from whoosh.analysis import StandardAnalyzer
from jieba.analyse import ChineseAnalyzer
from whoosh.analysis import LanguageAnalyzer
from jieba.analyse import ChineseAnalyzer
from collections import defaultdict
# jieba's ChineseAnalyzer, built once at import time; used as the analyzer of
# the `title` field in the schema built by pub_rebuild().
analyzer_zhongwen = ChineseAnalyzer()
# RegexAnalyzer with its default arguments; used as the analyzer of the
# `pinyin_title` field in the schema built by pub_rebuild().
analyzer_pinyin = RegexAnalyzer()


def pub_rebuild():
    print datetime.datetime.now()
    print 'pub_rebuild'
    pub_db = mysql_new.BaseDB(config.MYSQL_DEFINE_PUB)
    schema = Schema(
        uid=ID(stored=True, unique=True),
        title=TEXT(stored=True, analyzer=analyzer_zhongwen),
        pinyin_title=TEXT(stored=True, analyzer=analyzer_pinyin),
        icon_url=ID(stored=True),
        description=STORED,
        v_status=NUMERIC(stored=True),
    )
    SQL = '''SELECT `uid`, `title`, `icon_url`, `description`, `v_status` FROM `pp_category_info`
Пример #5
0
 def __init__(self, stored=False, unique=False, expression=None):
     """
     Build the field's format from a regex-based analyzer and keep its flags.

     :param stored: Kept on the instance as ``self.stored``.
     :param unique: Kept on the instance as ``self.unique``.
     :param expression: Regular expression handed to ``RegexAnalyzer``. When
         omitted (or falsy), a default pattern is used that matches runs of
         characters other than CR, LF, tab, space, comma and semicolon.
     """
     # Fall back to the default tokenizing pattern when none was supplied.
     if not expression:
         expression = re.compile(r"[^\r\n\t ,;]+")

     # The analyzer is only needed to construct the format object.
     self.format = Existence(analyzer=RegexAnalyzer(expression=expression))
     self.stored, self.unique = stored, unique