def setUp(self):
    indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                            "index-dir")
    cIndexDir = "%s-compound" % (indexDir)
    mIndexDir = "%s-multi" % (indexDir)
    self.rmdir(cIndexDir)
    self.rmdir(mIndexDir)
    self.cDir = SimpleFSDirectory(File(cIndexDir))
    self.mDir = SimpleFSDirectory(File(mIndexDir))

def setUp(self):
    fsIndexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                              "fs-index")
    self.rmdir(fsIndexDir)
    self.ramDir = RAMDirectory()
    self.fsDir = SimpleFSDirectory(File(fsIndexDir))

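# Both setUp methods above call self.rmdir(), which is not shown in these
# snippets. A minimal sketch of what such a helper might look like (the
# recursive-cleanup logic is an assumption, not part of the original code):
def rmdir(self, dir):
    import shutil
    # ignore_errors covers the common case where the directory
    # does not exist yet on the first run
    shutil.rmtree(dir, ignore_errors=True)
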
def __init__(self):
    # Borg pattern: every instance shares one state dict (assumes a
    # class-level `__shared_state = {}` attribute is defined), so the
    # JVM and index are initialized only once
    self.__dict__ = self.__shared_state
    if not self.__shared_state:
        self.jccvm = lucene.initVM()
        self.index = SimpleFSDirectory(
            lucene.File(settings.lucene_index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

def luceneRetriver(query):
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)
    print "Hits: ", total_hits.totalHits
    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, \
            "HitString:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")

def __init__(self, location):
    lucene.initVM()
    directory = SimpleFSDirectory(File(location))
    self.reader = IndexReader.open(directory, True)
    self.searcher = IndexSearcher(self.reader)
    self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                    WhitespaceAnalyzer())

def luceneIndexer(contents):
    lucene.initVM()
    INDEXDIR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXDIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()

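# luceneIndexer() calls tfile.getvalue() on each element, so `contents`
# is expected to be a sequence of file-like buffers. A hypothetical call
# (the sample strings are assumptions for illustration):
from StringIO import StringIO

luceneIndexer([StringIO("first document"), StringIO("second document")])
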
def post(self):
    q = self.get_argument("query")
    lucene.initVM()
    indexDir = "index"
    directory = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(directory)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 10
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (
        hits.totalHits, query)
    items = []
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        print len(doc_urls)
        items.append(doc_urls[str(hit.doc)])
        doc = searcher.doc(hit.doc)
        print hit.doc
    self.render("index.html", title="Results", items=items, query=q)

def names():
    lst = []
    search = "spax"  # request.form['product']
    lucene.initVM()
    directory = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(directory)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (
        hits.totalHits, query)
    for hit in hits.scoreDocs:
        if hit.score >= 1:
            print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(',')
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
                    lst.append(item)
    data = {"products": lst}
    # the original branched on request.method here, but both branches
    # returned the same payload
    return jsonify(data)

def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    directory = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(directory, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
    f.close()

def init():
    global searcher, analyzer, vm
    vm = initVM()
    STORE_DIR = "index_qst"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

def reindex(self):
    writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                         self.corpus.analyzer, False,
                         IndexWriter.MaxFieldLength.LIMITED)
    indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
    writer.optimize()
    writer.close()
    self.parent.write({'message':
                       "Reindex successful. Corpus analyzer is now set to %s."
                       % (self.corpus.analyzer_str,)})
    self.parent.write({'status': "Ready!"})

def setUp(self):
    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                            'index-dir')
    self.rmdir(indexDir)
    self.dir = SimpleFSDirectory(File(indexDir))
    self.addDocuments(self.dir)

def indexDocuments():
    # empty the index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)
    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)
    for article in Wikipedia():
        doc = Document()
        doc.add(Field('id', str(article['id'][0]),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('title', article['url'],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article['text'],
                      Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print 'Optimization'
    writer.optimize()
    writer.close()

def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    # create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (
        hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        if len(nonDiverse) < 10:
            nonDiverse.append(new_urls[str(hit.doc)])
        # find the document that corresponds to the html website and
        # append to a list for min distance
        website = new_urls[str(hit.doc)]
        # html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print inv_map[website]
    return docsToScores, rQ, nonDiverse

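# retrieveDocs() depends on two module-level mappings that are not shown
# in the snippet. A hypothetical illustration of their assumed shapes
# (the URL and ids are placeholders, not real data):
new_urls = {'0': 'http://example.com/page.html'}   # Lucene doc id -> URL
inv_map = {'http://example.com/page.html': '0'}    # URL -> html_files number
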
def __init__(self):
    lucene.initVM()
    self._lversion = Version.LUCENE_30
    self._analyzer = EnglishAnalyzer(self._lversion)
    # self.indexDir is expected to be defined on the class
    self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))
    self._translation = loadTranslation()
    self._links = loadLinks()

def _init_index(self):
    if not os.path.exists(self.corpus.path):
        os.mkdir(self.corpus.path)
    try:
        searcher = IndexSearcher(SimpleFSDirectory(File(self.corpus.path)),
                                 True)
    except lucene.JavaError:
        # no usable index yet: create an empty one
        analyzer = self.corpus.analyzer
        writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                             analyzer, True,
                             IndexWriter.MaxFieldLength.LIMITED)
        writer.setMaxFieldLength(1048576)
        writer.optimize()
        writer.close()
    self.lucene_index = SimpleFSDirectory(File(self.corpus.path))
    self.searcher = IndexSearcher(self.lucene_index, True)
    self.reader = IndexReader.open(self.lucene_index, True)
    self.analyzer = self.corpus.analyzer

def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()

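# A hypothetical invocation of index_files(); the paths are assumptions,
# and parse_file() (which turns one input file into Document objects and
# adds them to the writer) must be supplied by the surrounding module:
index_files(['docs/a.txt', 'docs/b.txt'], '/tmp/lucene-index')
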
def init():
    global STORE_DIR, directory, searcher, analyzer, vm_env
    STORE_DIR = "index_lucene_v3_highlight"
    if vm_env is None:
        vm_env = initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, command)
    searcher.close()
    return result

def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles

def Searchfile(command, prior, page, RPP):
    STORE_DIR = "index_ans"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    store = run(searcher, analyzer, command, prior)
    searcher.close()
    start = (page - 1) * RPP
    end = start + RPP
    return store[start:end], len(store)

def __init__(self, rows=None):
    # Call lucene.initVM() once (e.g. in Django's settings.py) and only
    # attach the current thread to the running VM here.
    vm_env = lucene.getVMEnv()
    if vm_env is None:
        lucene.initVM()
    else:
        vm_env.attachCurrentThread()
    self.analyzer = lucene.StandardAnalyzer(Version.LUCENE_30)
    self.indexDir = SimpleFSDirectory(File(INDEX_DIRECTORY))
    self.rows = rows

def import_csv_with_content(self, csv_file, content_field):
    try:
        writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                             self.analyzer, False,
                             IndexWriter.MaxFieldLength.LIMITED)
        changed_rows = addmetadata.add_metadata_and_content_from_csv(
            self.searcher, self.reader, writer, csv_file, content_field,
            self.args_dir)
        writer.close()
    except UnicodeDecodeError:
        try:
            writer.close()
        except:
            pass
        self.parent.write({'error': 'CSV import failed: file contained '
                           'non-unicode characters. Please save the file '
                           'with UTF-8 encoding and try again!'})
        return
    self.parent.write({'message': "CSV import complete: %s rows added."
                       % (changed_rows,)})

def search_lucene_index(search_params, index_dir, index_metadata,
                        records_per_page):
    """
    Uses the query term provided to search the disease ontology
    lucene index.
    """
    results = []
    index_dir = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(index_dir)
    index_fields = index_metadata.keys()

    # Since we are paging results, grab what page we are on; the offset
    # converts the 1-based page shown in the pagination display to the
    # 0-based indexing the searcher expects.
    page = int(search_params.get('page', 1)) - 1
    offset = page * records_per_page

    # For an advanced search, build a BooleanQuery in parts; a basic
    # search uses a single MultiFieldQueryParser.
    query = None
    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                       index_fields, analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param(search_params.get('q')))

    # Parse through our hits
    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    count = min(hits.totalHits - offset, records_per_page)

    for i in xrange(0, count):
        score_doc = hits.scoreDocs[offset + i]
        doc = searcher.doc(score_doc.doc)
        term_id = doc.get('term id')
        name = doc.get('name')
        explain = searcher.explain(query, score_doc.doc)
        match_fields = get_field_matches(explain.toString(), index_fields)
        results.append((term_id, name, list(match_fields)))

    searcher.close()
    return (results, total_hits)

def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    # split the command into per-field clauses; bare terms are segmented
    # with jieba and searched against the 'contents' field
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = (command_dict.get(opt, '') + ' ' +
                                 " ".join(seg_list))

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # tokenStream() takes the field name, not the field value
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs

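# search_image() accepts "field:value" prefixes for the allowed 'site'
# option; any other term is segmented and matched against 'contents'.
# A hypothetical call (the query string is an assumption):
results = search_image('site:example.com scenery')
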
def createIndex(cls, dataDir, indexDir, useCompound):
    indexDir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(indexDir,
                         StandardAnalyzer(Version.LUCENE_CURRENT), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(useCompound)
    for dir, dirnames, filenames in os.walk(dataDir):
        for filename in filenames:
            if filename.endswith('.properties'):
                cls.indexFile(writer, os.path.join(dir, filename), dataDir)
    writer.optimize()
    writer.close()

def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    directory = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(directory)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (
        hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)

def search(cls, indexDir, q):
    fsDir = SimpleFSDirectory(File(indexDir))
    searcher = IndexSearcher(fsDir, True)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
    start = time()
    hits = searcher.search(query, 50).scoreDocs
    duration = timedelta(seconds=time() - start)
    print "Found %d document(s) (in %s) that matched query '%s':" % (
        len(hits), duration, q)
    for hit in hits:
        doc = searcher.doc(hit.doc)
        print 'path:', doc.get("path")

def find(self, query, indir):
    lucene.initVM()
    INDEXDIR = indir
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "<default field>",
                           lucene_analyzer).parse(
        "text:" + query + " OR title:" + query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)
    print "\nHits: ", total_hits.totalHits, "\n"
    for hit in total_hits.scoreDocs:
        print "Hit Score:", "%.4f" % hit.score, \
            "Department:", lucene_searcher.doc(hit.doc).get("department").encode("utf-8"), \
            "Title:", lucene_searcher.doc(hit.doc).get("title").encode("utf-8")
        print lucene_searcher.doc(hit.doc).get("url").encode("utf-8"), '\n'

def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError("%s does not exist or is not a directory" % dataDir)
    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)
    cls.indexDirectory(writer, dataDir)
    numIndexed = writer.numDocs()
    writer.optimize()
    writer.close()
    dir.close()
    return numIndexed

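# A hypothetical driver tying together the index()/search() classmethods
# above; the Indexer/Searcher class names and the paths are assumptions:
count = Indexer.index('/tmp/lucene-index', 'data/')
print 'Indexed %d documents' % count
Searcher.search('/tmp/lucene-index', 'lucene')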