Example #1
    def setUp(self):
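        # build two scratch index directories (compound vs. multi-file)
        # under the JVM temp dir, clearing leftovers from earlier runs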

        indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                                "index-dir")

        cIndexDir = "%s-compound" % (indexDir)
        mIndexDir = "%s-multi" % (indexDir)
        self.rmdir(cIndexDir)
        self.rmdir(mIndexDir)

        self.cDir = SimpleFSDirectory(File(cIndexDir))
        self.mDir = SimpleFSDirectory(File(mIndexDir))
Example #2
    def setUp(self):

        fsIndexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                                  "fs-index")
        self.rmdir(fsIndexDir)
        self.ramDir = RAMDirectory()
        self.fsDir = SimpleFSDirectory(File(fsIndexDir))
Example #3
 def __init__(self):
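     # Borg-style shared state: every instance aliases the same __dict__,
     # so the JVM, index directory, and analyzer are created only once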
     self.__dict__ = self.__shared_state
     if not self.__shared_state:
         self.jccvm = lucene.initVM()
         self.index = SimpleFSDirectory(
             lucene.File(settings.lucene_index_dir))
         self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
Example #4
def luceneRetriver(query):

    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)
    print "Hits: ", total_hits.totalHits

    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, \
            "HitString:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
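
These retrieval snippets assume PyLucene 3.x, where every Lucene class is exposed in the flat `lucene` namespace. A minimal sketch of the imports they rely on (`INDEXDIR` here is a placeholder path, not taken from the original):

    import lucene
    from lucene import (File, SimpleFSDirectory, StandardAnalyzer,
                        IndexSearcher, QueryParser, Version)

    INDEXDIR = "index"  # assumed: path to an existing Lucene index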
Example #5
 def __init__(self, location):
     lucene.initVM()
     directory = SimpleFSDirectory(File(location))
     self.reader = IndexReader.open(directory, True)
     self.searcher = IndexSearcher(self.reader)
     self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                     WhitespaceAnalyzer())
Example #6
def luceneIndexer(contents):
    lucene.initVM()

    INDEXDIR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXDIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile

    # optimize once after all documents are added, not per document
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
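
A hedged usage sketch for the indexer above, assuming `contents` is an iterable of file-like objects (anything with a `getvalue()` method, e.g. StringIO buffers) and that `settings.INDEX_DIR` points to a writable directory:

    from StringIO import StringIO

    buffers = [StringIO(u"first document"), StringIO(u"second document")]
    luceneIndexer(buffers)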
Example #7
    def post(self):
        q = self.get_argument("query")

        lucene.initVM()
        indexDir = "index"
        dir = SimpleFSDirectory(File(indexDir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        searcher = IndexSearcher(dir)

        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
        MAX = 10
        hits = searcher.search(query, MAX)

        print "Found %d document(s) that matched query '%s':" % (
            hits.totalHits, query)

        items = []
        for hit in hits.scoreDocs:
            print hit.score, hit.doc, hit.toString()
            items.append(doc_urls[str(hit.doc)])

        self.render("index.html", title="Results", items=items, query=q)
Example #8
File: app.py  Project: ProjectLISM/GUI
def names():
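    # search the index for the hard-coded term and return, as JSON, the
    # other comma-separated values stored in the matching "text" fields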
    lst = []

    search = "spax"  #request.form['product']
    lucene.initVM()

    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                        analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)

    for hit in hits.scoreDocs:
        if hit.score >= 1:
            print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(',')
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
                    lst.append(item)
    data = {"products": lst}
    return jsonify(data)
Example #9
File: app.py  Project: ProjectLISM/GUI
def configure_lucene():

    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    #indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % \
        writer.numDocs()

    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    f.close()

    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % \
        writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % \
        writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % \
        writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example #10
def init():
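    # process-wide setup: start the JVM and open a read-only searcher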
    global searcher, analyzer, vm
    vm = initVM()
    STORE_DIR = "index_qst"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
Example #11
 def reindex(self):
     writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                          self.corpus.analyzer, False,
                          IndexWriter.MaxFieldLength.LIMITED)
     indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
     writer.optimize()
     writer.close()
     self.parent.write({'message': "Reindex successful. Corpus analyzer is "
                                   "now set to %s." % (self.corpus.analyzer_str,)})
     self.parent.write({'status': "Ready!"})
Example #12
    def setUp(self):

        indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                                'index-dir')
        self.rmdir(indexDir)
        self.dir = SimpleFSDirectory(File(indexDir))
        self.addDocuments(self.dir)
Example #13
def indexDocuments():
    # empty index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    for article in Wikipedia():
        doc = Document()
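        # id/title are stored verbatim but not analyzed; content is
        # analyzed for search but not stored, keeping the index small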
        doc.add(
            Field('id', str(article['id'][0]), Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field('title', article['url'], Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field('content', article['text'], Field.Store.NO,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)

    print 'Optimization'
    writer.optimize()
    writer.close()
Example #14
def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    #create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        #print(new_urls[str(hit.doc)])
        if len(nonDiverse) < 10:
            nonDiverse.append(new_urls[str(hit.doc)])
        #find the document that corresponds to the html website and append to a list for min distance
        website = new_urls[str(hit.doc)]
        #html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
Example #15
    def __init__(self):
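        # self.indexDir is assumed to be provided elsewhere on the class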
        lucene.initVM()
        self._lversion = Version.LUCENE_30
        self._analyzer = EnglishAnalyzer(self._lversion)
        self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))

        self._translation = loadTranslation()
        self._links = loadLinks()
Example #16
    def _init_index(self):

        if not os.path.exists(self.corpus.path):
            os.mkdir(self.corpus.path)
        try:
            # probe for an existing index; this raises if none exists yet
            searcher = IndexSearcher(SimpleFSDirectory(File(self.corpus.path)),
                                     True)
        except lucene.JavaError:
            # no usable index: create an empty one
            analyzer = self.corpus.analyzer
            writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                                 analyzer, True,
                                 IndexWriter.MaxFieldLength.LIMITED)
            writer.setMaxFieldLength(1048576)
            writer.optimize()
            writer.close()

        self.lucene_index = SimpleFSDirectory(File(self.corpus.path))
        self.searcher = IndexSearcher(self.lucene_index, True)
        self.reader = IndexReader.open(self.lucene_index, True)
        self.analyzer = self.corpus.analyzer
Example #17
def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()
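
`parse_file` is not shown in this example. A hypothetical sketch consistent with how it is called (one document per file; the field name is invented here):

    def parse_file(path, writer):
        # hypothetical helper: index each file's raw text under a "text" field
        doc = Document()
        doc.add(Field("text", open(path).read(),
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)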
Example #18
def init():
    global STORE_DIR, directory, searcher, analyzer, vm_env
    STORE_DIR = "index_lucene_v3_highlight"
    if vm_env is None:
        vm_env = initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
Example #19
def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, command)
    searcher.close()
    return result
Example #20
def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles
Example #21
def Searchfile(command, prior, page, RPP):
    STORE_DIR = "index_ans"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    store = run(searcher, analyzer, command, prior)
    searcher.close()
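    # page is 1-based; slice the ranked results down to one page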
    start = (page - 1) * RPP
    end = start + RPP

    return store[start:end], len(store)
Example #22
 def __init__(self, rows=None):
     # lucene.initVM() is expected to be called once in Django's settings.py;
     # here we only attach the current thread to the already-running VM
     vm_env = lucene.getVMEnv()
     if vm_env is None:
         lucene.initVM()
     else:
         vm_env.attachCurrentThread()
     self.analyzer = lucene.StandardAnalyzer(Version.LUCENE_30)
     self.indexDir = SimpleFSDirectory(File(INDEX_DIRECTORY))
     self.rows = rows
Example #23
 def import_csv_with_content(self, csv_file, content_field):
     try:
         writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                              self.analyzer, False,
                              IndexWriter.MaxFieldLength.LIMITED)
         changed_rows = addmetadata.add_metadata_and_content_from_csv(
             self.searcher, self.reader, writer, csv_file,
             content_field, self.args_dir)
         writer.close()
     except UnicodeDecodeError:
         try:
             writer.close()
         except:
             pass
         self.parent.write({'error': 'CSV import failed: the file contained '
                            'non-unicode characters. Please save the file '
                            'with UTF-8 encoding and try again!'})
         return
     self.parent.write({'message': "CSV import complete: %s rows added."
                        % (changed_rows,)})
Example #24
def search_lucene_index(search_params, index_dir, index_metadata,
                        records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index
    """
    results = []

    index_dir = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(index_dir)
    index_fields = index_metadata.keys()

    # Since we are paging results we want to grab what page we are on
    page = (int(search_params.get('page', 1))) - 1

    # Convert between the 0-based indexing used here and the 1-based
    # indexing of the pagination display
    offset = int(page) * records_per_page

    # If we are executing an advanced search we build a BooleanQuery in
    # parts, as opposed to the single MultiFieldQueryParser used for a
    # basic search
    query = None

    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, index_fields,
                                       analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param((search_params.get('q'))))

    # Parse through our hits
    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    count = min(hits.totalHits - offset, records_per_page)

    for i in xrange(0, count):
        score_doc = hits.scoreDocs[offset + i]
        doc = searcher.doc(score_doc.doc)
        term_id = doc.get('term id')
        name = doc.get('name')
        explain = searcher.explain(query, score_doc.doc)
        match_fields = get_field_matches(explain.toString(), index_fields)

        results.append((term_id, name, list(match_fields)))

    searcher.close()
    return (results, total_hits)
Example #25
def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
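    # AND together one clause per field: a WildcardQuery for "site",
    # a parsed full-text query for everything else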
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # tokenStream takes the field name, not the field value
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
Example #26
    def createIndex(cls, dataDir, indexDir, useCompound):

        indexDir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(indexDir,
                             StandardAnalyzer(Version.LUCENE_CURRENT), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
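        # toggle between Lucene's compound (.cfs) and multi-file index formats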
        writer.setUseCompoundFile(useCompound)

        for dir, dirnames, filenames in os.walk(dataDir):
            for filename in filenames:
                if filename.endswith('.properties'):
                    cls.indexFile(writer, os.path.join(dir, filename), dataDir)

        writer.optimize()
        writer.close()
Example #27
def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)
Example #28
    def search(cls, indexDir, q):

        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(fsDir, True)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
        start = time()
        hits = searcher.search(query, 50).scoreDocs
        duration = timedelta(seconds=time() - start)

        print "Found %d document(s) (in %s) that matched query '%s':" % (
            len(hits), duration, q)

        for hit in hits:
            doc = searcher.doc(hit.doc)
            print 'path:', doc.get("path")
Example #29
    def find(self, query, indir):
        lucene.initVM()
        INDEXDIR = indir

        indir = SimpleFSDirectory(File(INDEXDIR))
        lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
        lucene_searcher = IndexSearcher(indir)
        my_query = QueryParser(Version.LUCENE_30, "<default field>",
            lucene_analyzer).parse("text:" + query + " OR title:" + query)
        MAX = 1000
        total_hits = lucene_searcher.search(my_query, MAX)
        print "\nHits: ", total_hits.totalHits, "\n"

        for hit in total_hits.scoreDocs:
            doc = lucene_searcher.doc(hit.doc)
            print "Hit Score:", "%.4f" % hit.score, \
                "Department:", doc.get("department").encode("utf-8"), \
                "Title:", doc.get("title").encode("utf-8")
            print doc.get("url").encode("utf-8"), '\n'
Example #30
    def index(cls, indexDir, dataDir):

        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
            raise IOError, "%s does not exist or is not a directory" % dataDir

        dir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(False)

        cls.indexDirectory(writer, dataDir)

        numIndexed = writer.numDocs()
        writer.optimize()
        writer.close()
        dir.close()

        return numIndexed