Example #1
 def __init__(self, indexDir):
     self.directory = SimpleFSDirectory(Paths.get(indexDir))
     self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
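     # Two field-specific parsers; the AND default makes multi-term queries conjunctive.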
     self.nameQueryParser = QueryParser('name', StandardAnalyzer())
     self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
     self.idQueryParser = QueryParser('id', StandardAnalyzer())
     self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)
Example #2
 def GET(self):
     user_data = web.input()
     message = user_data.keyword
     lis = []  # bind lis up front so the render call below never sees it undefined
     if len(message) > 10:
         # The original test "a and b or c" parsed as "(a and b) or c";
         # the intent is simply a suffix check for an image URL.
         if message[-3:] in ('png', 'jpg'):
             urlretrieve(message, 'target.jpg')
             lis1 = shit.LSH('target.jpg')
             vm_env.attachCurrentThread()
             STORE_DIR = 'index'
             directory = SimpleFSDirectory(File(STORE_DIR))
             searcher = IndexSearcher(DirectoryReader.open(directory))
             analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
             for i in range(len(lis1)):
                 lis.append(run(searcher, analyzer, lis1[i])[0])
     else:
         a = func(user_data.keyword)
         STORE_DIR = 'index'
         vm_env.attachCurrentThread()
         directory = SimpleFSDirectory(File(STORE_DIR))
         searcher = IndexSearcher(DirectoryReader.open(directory))
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         lis = run(searcher, analyzer, a)
     f = login
     return render.movies(f, lis)
Example #3
 def search(self, field: str):
     sear = self._search
     if len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
         query = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
     elif self._commandInfo.getKey()[0] == '#':
         query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
         query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
         bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
         bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
         query = BooleanQuery.Builder().add(bc1).add(bc2).build()
     elif self._commandInfo.getKey()[0] in ['$', '+']:
         bq = BooleanQuery.Builder()
         for w in self._commandInfo.getWordList():
             queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
             bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
             bq.add(bc)
         query = bq.build()
     else:
         # No recognised key prefix: return early instead of passing an
         # empty string to IndexSearcher.search(), which would raise.
         return self
     hits = sear.search(query, 999999)
     for hit in hits.scoreDocs:
         doc = sear.doc(hit.doc)
         res = doc.get(field)
         id = doc.get(field+'_id')
         if doc_hit(res, self._commandInfo):
             sentences = re.split('[!?!?。]', res)
             # Drop empty fragments: the original lazy map()/pop() was never
             # consumed and compared the index, not the string, to ''.
             sentences = [s for s in sentences if s != '']
             for sentence in sentences:
                 if key_filter(self._commandInfo, sentence):
                     self._doc[id] = res
                     self._resultSentencesList.append((id, sentence))
     return self
Example #4
 def __init__(self, fs_directory):
     directory = SimpleFSDirectory(Paths.get(fs_directory))
     self.index_reader = DirectoryReader.open(directory)
     # Reuse the already-open reader rather than opening (and leaking)
     # a second one, and create the analyzer only once.
     self.searcher = IndexSearcher(self.index_reader)
     self.analyzer = StandardAnalyzer()
     self.query = None
     self.lucene_dictionary = LuceneDictionary(self.index_reader,
                                               'contents')
     self.formatter = SimpleHTMLFormatter()
     self.hits = None
Example #5
def retrival_answer(MAX):
    lucene.initVM()
    # Build the index in memory; writer and searcher share the same
    # RAMDirectory (the original also created an unused on-disk
    # SimpleFSDirectory here).
    directory = RAMDirectory()

    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))

        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))

    print "the final accuracy is:", final_accuracy
Example #6
    def testOverrideBooleanQuery(self):
        class TestQueryParser(BooleanTestMixin, PythonMultiFieldQueryParser):
            def getFieldQuery_quoted(_self, field, queryText, quoted):
                return super(TestQueryParser,
                             _self).getFieldQuery_quoted_super(
                                 field, queryText, quoted)

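        # Parse "foo bar" against both fields; SHOULD gives OR semantics.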
        qp = TestQueryParser(['one', 'two'], StandardAnalyzer())
        q = qp.parse("foo bar", ['one', 'two'],
                     [BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD],
                     StandardAnalyzer())
        self.assertEqual(str(q), "(one:foo one:bar) (two:foo two:bar)")
Example #7
def retrieve(command):
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except ValueError:
        print "JVM running."

    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()

    # to convert to AND query
    command = re.sub(r' ', r' +', command)
    command = "+" + command

    print "Searching for:", command
    query = QueryParser("contents", analyzer).parse(command)
    print query
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    retrieved_docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        retrieved_docs.append(os.path.join(doc.get("path"), doc.get("name")))

    del searcher
    return retrieved_docs
Example #8
    def create_index(self, index_folder, docs_path, add_terms=False):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0

        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()
Example #9
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/Users/Raphael/Downloads/stackoverflow1107")

        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "methods_called": KeywordAnalyzer(),
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.commit()
        writer.close()
        print "Done"
    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Example #10
    def __init__(self, index_store_path):

        store = NIOFSDirectory(Paths.get(index_store_path))
        analyzer = StandardAnalyzer()
        config = IndexWriterConfig(analyzer)
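        # CREATE_OR_APPEND reuses an existing index, or creates one if absent.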
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        self.writer = IndexWriter(store, config)
Example #11
    def setUp(self):
        super(PyLuceneThreadTestCase, self).setUp()

        self.classLoader = Thread.currentThread().getContextClassLoader()

        writer = self.getWriter(analyzer=StandardAnalyzer())

        doc1 = Document()
        doc2 = Document()
        doc3 = Document()
        doc4 = Document()
        doc1.add(Field("field", "one", TextField.TYPE_STORED))
        doc2.add(Field("field", "two", TextField.TYPE_STORED))
        doc3.add(Field("field", "three", TextField.TYPE_STORED))
        doc4.add(Field("field", "one", TextField.TYPE_STORED))

        writer.addDocument(doc1)
        writer.addDocument(doc2)
        writer.addDocument(doc3)
        writer.addDocument(doc4)
        writer.commit()
        writer.close()

        self.testData = [('one', 2), ('two', 1), ('three', 1),
                         ('five', 0)] * 500
        self.lock = threading.Lock()
        self.totalQueries = 0
Example #12
    def testGiga(self):

        w = self.getWriter(analyzer=StandardAnalyzer())

        self._addDoc("Lucene in Action", w)
        self._addDoc("Lucene for Dummies", w)

        self._addDoc("Giga byte", w)

        self._addDoc("ManagingGigabytesManagingGigabyte", w)
        self._addDoc("ManagingGigabytesManagingGigabytes", w)

        self._addDoc("The Art of Computer Science", w)
        self._addDoc("J. K. Rowling", w)
        self._addDoc("JK Rowling", w)
        self._addDoc("Joanne K Roling", w)
        self._addDoc("Bruce Willis", w)
        self._addDoc("Willis bruce", w)
        self._addDoc("Brute willis", w)
        self._addDoc("B. willis", w)

        r = w.getReader()
        w.close()

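        # maxEdits=0 turns the fuzzy query into an exact term match on "giga".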
        q = FuzzyQuery(Term("field", "giga"), 0)

        searcher = self.getSearcher(reader=r)
        hits = searcher.search(q, 10).scoreDocs

        self.assertEqual(1, len(hits))
        self.assertEqual("Giga byte", searcher.doc(hits[0].doc).get("field"))
Example #13
 def __init__(self, index_path, query=None):
     self.index_path = index_path
     self.reader = None
     self.query = query
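     # PorterAnalyzer is project-local; it presumably layers Porter
     # stemming on top of StandardAnalyzer.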
     self.porter_analyzer = PorterAnalyzer(
         StandardAnalyzer(Version.LUCENE_CURRENT))
     self.load_index()
Example #14
    def __init__(self, index_path, method, logger=None, use_default_similarity=False):
        self.index_path=index_path
        directory = SimpleFSDirectory(File(self.index_path))
        self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
        self.reader=DirectoryReader.open(directory)
        self.searcher = IndexSearcher(self.reader)

        # this flag selects both the query parser and the similarity used
        if use_default_similarity:
            self.query_parser = QueryParser
            similarity = DefaultSimilarity()
            self.useExplainQuery = False
        else:
            self.query_parser = FieldAgnosticQueryParser
            similarity = FieldAgnosticSimilarity()
            self.useExplainQuery = True
        # by default, FieldAgnosticSimilarity uses coord factor, can be disabled
##        similarity.useCoord=False

        self.searcher.setSimilarity(similarity)
        self.method=method # never used?
        self.logger=logger
Example #15
def run_music(ID):
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "id", analyzer).parse(ID)
    scoreDocs = searcher.search(query, 1).scoreDocs

    try:
        scoreDoc = scoreDocs[0]
    except IndexError:  # no document matched this id
        return None
    doc = searcher.doc(scoreDoc.doc)

    item = []
    item.append(doc.get("song_title").encode('utf-8'))
    item.append(doc.get('song_url'))
    item.append(doc.get("singer").encode('utf-8'))
    item.append(doc.get("album").encode('utf-8'))
    item.append(doc.get("album_pic"))
    item.append(doc.get("album_genre").encode('utf-8'))
    item.append(doc.get("lyrics").encode('utf-8'))

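    # The stored "similar" field appears to be encoded as "t1*u1+t2*u2+...":
    # '+' separates entries and '*' separates the subfields of each entry.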
    sim_str = doc.get("similar").encode('utf-8')
    sim_list = sim_str.split('+')
    for i in range(3):
        sim_list[i] = sim_list[i].split('*')
    item.append(sim_list)

    del searcher

    return item
Example #16
    def testCompressionTools(self):

        bytes = JArray('byte')(self.binaryValCompressed)
        binaryFldCompressed = StoredField("binaryCompressed",
                                          CompressionTools.compress(bytes))
        stringFldCompressed = StoredField(
            "stringCompressed",
            CompressionTools.compressString(self.binaryValCompressed))

        doc = Document()
        doc.add(binaryFldCompressed)
        doc.add(stringFldCompressed)

        # add the doc to a ram index
        writer = self.getWriter(analyzer=StandardAnalyzer())
        writer.addDocument(doc)
        writer.close()

        # open a reader and fetch the document
        reader = self.getReader()
        docFromReader = reader.document(0)
        self.assertTrue(docFromReader is not None)

        # fetch the binary compressed field and compare its content with
        # the original one
        bytes = CompressionTools.decompress(
            docFromReader.getBinaryValue("binaryCompressed"))
        binaryFldCompressedTest = bytes.string_
        self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
        self.assertEqual(
            CompressionTools.decompressString(
                docFromReader.getBinaryValue("stringCompressed")),
            self.binaryValCompressed)

        reader.close()
Example #17
 def __init__(self, db_path):
     directory = SimpleFSDirectory(File(db_path))
     reader = DirectoryReader.open(directory)
     self.searcher = IndexSearcher(reader)
     self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     logger.info("Loaded DB from %s with %d documents: ", db_path,
                 reader.numDocs())
Example #18
def build_index():

    lucene.initVM()

    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    store = SimpleFSDirectory(File(index_store_dir))
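    # LimitTokenCountAnalyzer indexes at most 1,048,576 tokens per field.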
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
Example #19
def main():
	try:
		indicesDestination = File(dest_path)
		analyzer = KeywordAnalyzer()
		porter_analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		a = {"code": porter_analyzer, "description": porter_analyzer, "typed_method_call": KeywordAnalyzer(),
			 "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "id": KeywordAnalyzer(), "literals": porter_analyzer}
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

		writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
		counter = Counter()
		generate_indices_from_benchmark(writer, counter)
		writer.close()

		print "All jobs are done.."
		print str(counter)

	except CorruptIndexException as e:		# when the index is corrupt
		e.printStackTrace()
	except LockObtainFailedException as e:	# when another writer is using the index
		e.printStackTrace()
	except IOException as e:	# when the directory can't be read/written
		e.printStackTrace()
Example #20
def shourcut_retriever(keyword):
    '''Retriever: searches within the summary text.'''
    global flag
    if flag:
        lucene.initVM()
    flag = False
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_4_10_1, "shortcut",
                        analyzer).parse(keyword)
    MAX = 20
    hits = searcher.search(query, MAX)

    print("Found %d document(s) that matched query '%s':" %
          (hits.totalHits, query))
    results = []
    for hit in hits.scoreDocs:
        print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        result = [doc.get('shortcut'), doc.get('url'), doc.get('name')]
        print(doc.get('url'))
        results.append(result)
    return results
Example #21
	def __init__(self, indexDir):
		f = Paths.get(indexDir)
		self._dir = SimpleFSDirectory(f)
		analyzer = StandardAnalyzer()
		config = IndexWriterConfig(analyzer)
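		# CREATE overwrites any index already at this path.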
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
		self._writer = IndexWriter(self._dir, config)
Example #22
    def testCachingWorks(self):
        writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
        writer.close()
        reader = SlowCompositeReaderWrapper.wrap(self.getReader())
        context = AtomicReaderContext.cast_(reader.getContext())

        class mockFilter(PythonFilter):
            def __init__(self):
                super(mockFilter, self).__init__()
                self._wasCalled = False
            def getDocIdSet(self, context, acceptDocs):
                self._wasCalled = True
                return FixedBitSet(context.reader().maxDoc())
            def clear(self):
                self._wasCalled = False
            def wasCalled(self):
                return self._wasCalled

        filter = mockFilter()
        cacher = CachingWrapperFilter(filter)

        # first time, nested filter is called
        strongRef = cacher.getDocIdSet(context, context.reader().getLiveDocs())
        self.assert_(filter.wasCalled(), "first time")

        # second time, nested filter should not be called
        filter.clear()
        cacher.getDocIdSet(context, context.reader().getLiveDocs())
        self.assert_(not filter.wasCalled(), "second time")

        reader.close()
Example #23
def run(command):
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 10).scoreDocs
    #print "%s total matching documents." % len(scoreDocs)
    res = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        tmp = []
        tmp.append([doc.get('name1'), doc.get('name2')])
        tmp.append(doc.get("homepage"))
        tmp.append(doc.get("intro"))
        tmp.append(doc.get('logo'))
        # 'goods' is stored with one item per line
        tmp.extend(doc.get('goods').split('\n'))
        res.append(tmp)

    return command, res
Example #24
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
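                # Index an overlapping window of up to 10 sentences,
                # advancing 3 sentences per document.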
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(
                    Field("text", sentence, Field.Store.YES,
                          Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
            print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #25
    def setUp(self):
        super(Test_Bug1763, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.d1 = RAMDirectory()
        self.d2 = RAMDirectory()

        w1, w2 = [
            self.getWriter(directory=d, analyzer=self.analyzer)
            for d in [self.d1, self.d2]
        ]
        doc1 = Document()
        doc2 = Document()
        doc1.add(
            Field("all", "blah blah double blah Gesundheit",
                  TextField.TYPE_NOT_STORED))
        doc1.add(Field('id', '1', StoredField.TYPE))
        doc2.add(
            Field("all", "a quick brown test ran over the lazy data",
                  TextField.TYPE_NOT_STORED))
        doc2.add(Field('id', '2', StoredField.TYPE))
        w1.addDocument(doc1)
        w2.addDocument(doc2)
        for w in [w1, w2]:
            w.close()
Example #26
    def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Indexer.

        Parameters
        ----------
        index_dir : string
            The location of lucene index
        mode : string
            The mode used when opening the lucene index. Available values:
                'create', open a new index, overwriting any existing one,
                'append', open an existing index and append to it,
                'create_or_append', 'append' if `index_dir` exists,
                otherwise 'create'
        date_format : string
            Datetime fields are saved as strings; `date_format` specifies
            how a datetime is formatted into a string.
        """
        # self.store = FSDirectory.open(File(index_dir))
        self.store = FSDirectory.open(Paths.get(index_dir))
        # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = StandardAnalyzer()
        # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config = IndexWriterConfig(self.analyzer)
        self.mode = mode
        self.date_format = date_format
        if mode == 'create_or_append':
            self.config.setOpenMode(
                IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        elif mode == 'create':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        elif mode == 'append':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
        else:
            raise ValueError('Invalid mode %s' % mode)
        self.writer = IndexWriter(self.store, self.config)
Example #27
def main():
    LUCENE_INDEX_DIR = 'mmapDirectory/trec_v15_wikipedia_stemmed_v2'
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except ValueError:
        # initVM raises ValueError when the JVM is already running
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config=config.setRAMBufferSizeMB(1024.0)  # experimental setting !!
    # write data to index

    if not is_index_Exist:
        #if True:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            os.system('robocopy %s %s\code_files *.py' %
                      (r'%cd%', LUCENE_INDEX_DIR))
        else:
            os.system('mkdir %s/code_files' % (LUCENE_INDEX_DIR))
            os.system('cp *.py %s/code_files' % (LUCENE_INDEX_DIR))

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
Example #28
def build_index(document_path, dir_path):
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

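    # t1: stored + tokenized tag field; t2: stored, untokenized url field.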
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")

            document = Document()
            document.add(Field("content", " ".join(music_tags), t1))
            document.add(Field("url", music_path, t2))
            index_writer.addDocument(document)

    index_writer.close()
Example #29
    def retrieve_sents(self):

        indexDir = self.indexDir
        query = self.query

        sent_ind_list = []
        # template = CustomTemplate(format)
        fsDir = SimpleFSDirectory(Paths.get(indexDir))
        # print indexDir
        searcher = IndexSearcher(DirectoryReader.open(fsDir))

        analyzer = StandardAnalyzer()
        parser = QueryParser("contents", analyzer)
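        # OR semantics: a sentence matches if any query term matches.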
        parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(query)
        # print query
        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start
        # print query
        if self.stats:
            print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (
                len(scoreDocs), duration, query)

        for scoreDoc in scoreDocs:
            # print scoreDoc.doc
            # doc = searcher.doc(scoreDoc.doc)
            sent_ind_list.append(scoreDoc.doc)

        return sent_ind_list
Example #30
 def getWriter(self, store, analyzer=None, create=False):
     if analyzer is None:
         analyzer = StandardAnalyzer()
     config = IndexWriterConfig(analyzer)
     # honour the create flag instead of always overwriting the index
     if create:
         config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     else:
         config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
     writer = IndexWriter(store, config)
     return writer