Exemplos de NIOFSDirectory em Python, exemplos de org.apache.lucene.store.NIOFSDirectory em Python

Exemplo n.º 1

0

Exibir arquivo

    def __init__(self, index_store_path):
        """Open an index writer on the directory at ``index_store_path``.

        The writer is stored on ``self.writer``. It is configured with a
        ``StandardAnalyzer`` and the ``CREATE_OR_APPEND`` open mode.
        """
        index_dir = NIOFSDirectory(Paths.get(index_store_path))

        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        self.writer = IndexWriter(index_dir, writer_config)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: lucenedriver.py Projeto: bradleyjones/apiary

    def run(self):
        """Serve Lucene requests taken from ``self.queue`` in an endless loop.

        Each queue item is indexed as ``data[0]`` (the key under which the
        response is published in ``self.ret``) and ``data[1]`` (a mapping
        with an ``'action'`` method name and a ``'data'`` payload that
        contains at least ``'indexdir'``).

        For every request the index directory, analyzer and writer config
        are (re)created on ``self`` so the action methods can use them.
        Errors are printed and result in an empty ``{}`` response.
        """
        print("Booting lucene driver worker....")
        lucene.initVM()

        # Field type for regular content: indexed and tokenized, not stored.
        self.fieldType1 = FieldType()
        self.fieldType1.setIndexed(True)
        self.fieldType1.setStored(False)
        self.fieldType1.setTokenized(True)

        # Field type for identifiers: indexed and stored, not tokenized.
        self.fieldType2 = FieldType()
        self.fieldType2.setIndexed(True)
        self.fieldType2.setStored(True)
        self.fieldType2.setTokenized(False)

        while True:
            data = self.queue.get()
            da = data[1]
            response = None
            try:
                self.fil = File(da['data']['indexdir'])
                self.d = NIOFSDirectory(self.fil)
                try:
                    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
                    self.conf = IndexWriterConfig(
                        Version.LUCENE_CURRENT,
                        self.analyzer)

                    # NOTE(review): the action name comes straight from the
                    # queue payload and is resolved with getattr, so any
                    # attribute of this object can be invoked. Confirm the
                    # producer is trusted or restrict to known actions.
                    response = getattr(self, da['action'])(da['data'])
                finally:
                    # Always release the directory, even when the action
                    # fails; previously a failing action leaked it.
                    self.d.close()
            except Exception as e:
                print(e)
            if response is None:
                response = {}

            self.ret[data[0]] = response

Exemplo n.º 3

0

Exibir arquivo

    def __init__(self, corpusPath, storeDir):
        """Index the corpus at ``corpusPath`` into a new index in ``storeDir``.

        ``storeDir`` is created when it does not exist (including missing
        parent directories). The index is opened in ``CREATE`` mode, the
        documents are added through ``self.indexDocs`` and the result is
        committed while a ``Ticker`` runs on a background thread.

        The writer is always closed and the ticker is always told to stop,
        even when indexing or committing raises; the exception is then
        propagated to the caller.
        """
        if not os.path.exists(storeDir):
            # makedirs also handles a nested, not-yet-existing path.
            os.makedirs(storeDir)

        store = NIOFSDirectory(Paths.get(storeDir))
        analyzer = StandardAnalyzer()
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        ticker = None
        try:
            self.indexDocs(corpusPath, writer)
            ticker = Ticker()
            print('commit index')
            threading.Thread(target=ticker.run).start()
            writer.commit()
        finally:
            try:
                writer.close()
            finally:
                # Presumably Ticker.run loops while ``tick`` is true (its
                # definition is not visible here); clearing it on every
                # path keeps the background thread from running forever.
                if ticker is not None:
                    ticker.tick = False
        print('done')

Exemplo n.º 4

0

Exibir arquivo

Arquivo: search.py Projeto: JStrotmann/py-snippets

        i = doc.getValues('bestbets')
        self.bets = []
        if i:
            self.bets = i[0].split()
            
def do_query(property, qstring, limit=10):
    """Search field ``property`` for any of the terms found in ``qstring``.

    ``qstring`` is tokenized with the module-level ``analyzer``; every token
    becomes an optional (``Occur.SHOULD``) ``TermQuery`` on ``property``.
    The combined query is run on the module-level ``searcher``.

    Args:
        property: name of the indexed field to search.
        qstring: free-text query string.
        limit: maximum number of hits to return.

    Returns:
        A list of ``Document`` wrappers, one per hit, in the order returned
        by the searcher.
    """
    query = BooleanQuery()
    stream = analyzer.tokenStream(property, StringReader(qstring))
    try:
        stream.reset()
        attr = stream.getAttribute(CharTermAttribute)

        while stream.incrementToken():
            term = attr.toString()
            termQuery = TermQuery(Term(property, term))
            query.add(termQuery, Occur.SHOULD)

        # Lucene's TokenStream contract requires end() after the last
        # incrementToken() and close() when done; without close() the
        # analyzer's reused stream cannot be handed out again.
        stream.end()
    finally:
        stream.close()

    hits = searcher.search(query, None, limit).scoreDocs
    return [Document(searcher.doc(hit.doc)) for hit in hits]

# Module-level search state, created once at import time and read by
# do_query(): the analyzer tokenizes query strings and the searcher runs
# queries against the index stored in the 'lucene-ix' directory.
path = 'lucene-ix'
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
directory = NIOFSDirectory.open(File(path))
# The reader (and the directory under it) stay open for the life of the
# module; nothing here closes them.
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(reader)

if __name__ == '__main__':
    # All command-line arguments together form one query string, which is
    # searched in the 'indicators' field; the title of each hit is printed.
    terms = sys.argv[1:]
    for match in do_query('indicators', ' '.join(terms)):
        print(match.title)

Exemplo n.º 5

0

Exibir arquivo

Must be run with Jython.
'''

import movielib

from java.io import File
from org.apache.lucene.util import Version
from org.apache.lucene.store import NIOFSDirectory
from org.apache.lucene.index import IndexWriterConfig, IndexWriter
from org.apache.lucene.document import Field, Document
from org.apache.lucene.analysis.standard import StandardAnalyzer

# --- OPEN SEARCH INDEX
# The index is written to the "lucene-ix" directory. OpenMode.CREATE is
# requested; per Lucene's documentation this starts a new index and
# overwrites an existing one.
path = "lucene-ix"
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
directory = NIOFSDirectory.open(File(path))
cfg = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iwriter = IndexWriter(directory, cfg)

# --- START INDEXING
print 'Loading metadata'
# Map each movie id to its (title, categories) pair, as yielded by
# movielib.get_movies().
movies = {}
for (movieid, title, cats) in movielib.get_movies():
    movies[movieid] = (title, cats)

print 'Loading best bets'
# Each line of best-bets.txt is split on whitespace; the first column is
# taken as the id. The rest of this loop is not visible in this excerpt.
# NOTE(review): the file object is never explicitly closed, and the name
# `id` shadows the builtin of the same name.
bets = {}
for line in open('best-bets.txt'):
    row = line.split()
    id = row[0]

Exemplo n.º 6

0

Exibir arquivo

Arquivo: lucenedriver.py Projeto: bradleyjones/apiary

class Worker(Process):

    """Lucene worker process.

    Runs concurrently with other workers and contains the actual Lucene
    logic: it takes requests from ``queue``, dispatches them to one of the
    action methods below and publishes the outcome in ``ret``.
    """

    def __init__(self, config, queue, ret):
        """Store the configuration, the request queue and the result map."""
        Process.__init__(self)
        self.config = config
        self.connection = None
        self.channel = None
        self.queue = queue
        self.ret = ret

    def run(self):
        """Serve Lucene requests taken from ``self.queue`` in an endless loop.

        Each queue item is indexed as ``data[0]`` (the key under which the
        response is published in ``self.ret``) and ``data[1]`` (a mapping
        with an ``'action'`` method name and a ``'data'`` payload that
        contains at least ``'indexdir'``).

        For every request the index directory, analyzer and writer config
        are (re)created on ``self`` so the action methods can use them.
        Errors are printed and result in an empty ``{}`` response.
        """
        print("Booting lucene driver worker....")
        lucene.initVM()

        # Field type for regular content: indexed and tokenized, not stored.
        self.fieldType1 = FieldType()
        self.fieldType1.setIndexed(True)
        self.fieldType1.setStored(False)
        self.fieldType1.setTokenized(True)

        # Field type for identifiers: indexed and stored, not tokenized.
        self.fieldType2 = FieldType()
        self.fieldType2.setIndexed(True)
        self.fieldType2.setStored(True)
        self.fieldType2.setTokenized(False)

        while True:
            data = self.queue.get()
            da = data[1]
            response = None
            try:
                self.fil = File(da['data']['indexdir'])
                self.d = NIOFSDirectory(self.fil)
                try:
                    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
                    self.conf = IndexWriterConfig(
                        Version.LUCENE_CURRENT,
                        self.analyzer)

                    # NOTE(review): the action name comes straight from the
                    # queue payload and is resolved with getattr, so any
                    # attribute of this object can be invoked. Confirm the
                    # producer is trusted or restrict to known actions.
                    response = getattr(self, da['action'])(da['data'])
                finally:
                    # Always release the directory, even when the action
                    # fails; previously a failing action leaked it.
                    self.d.close()
            except Exception as e:
                print(e)
            if response is None:
                response = {}

            self.ret[data[0]] = response

    def rebuildIndex(self, data):
        """Recreate the index from ``data['records']``.

        The writer is opened in ``CREATE`` mode and one document is added
        per record, using ``data['fields']`` to select the indexed fields.
        """
        writer = IndexWriter(
            self.d, self.conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE))
        try:
            for record in data['records']:
                writer.addDocument(
                    self.buildDocument(data['fields'], record))
            writer.commit()
        finally:
            # Close on every path so a failing record does not leave the
            # writer open.
            writer.close()

    def buildDocument(self, fields, record):
        """Build a Lucene ``Document`` for ``record``.

        ``record['_id']`` is added as the stored, untokenized ``"id"``
        field. Every name in ``fields`` is added as a tokenized, unstored
        field; dict values are flattened through ``dictToFields``.
        """
        doc = Document()
        doc.add(
            Field("id",
                  record["_id"],
                  self.fieldType2))
        for field in fields:
            if isinstance(record[field], dict):
                self.dictToFields(doc, record[field])
            else:
                doc.add(
                    Field(field,
                          record[field],
                          self.fieldType1))

        return doc

    def dictToFields(self, doc, record):
        """Recursively add the leaf values of dict ``record`` to ``doc``.

        Each leaf is added under its own key (not a dotted path), so equal
        keys at different nesting levels end up in the same field name.
        """
        for key in record:
            if isinstance(record[key], dict):
                self.dictToFields(doc, record[key])
            else:
                doc.add(
                    Field(key,
                          record[key],
                          self.fieldType1))

    def index(self, data):
        """Add a single document built from ``data['record']``."""
        writer = IndexWriter(
            self.d, self.conf)
        try:
            doc = self.buildDocument(data['fields'], data['record'])
            writer.addDocument(doc)
            writer.commit()
        finally:
            writer.close()

    def updateindex(self, data):
        """Replace the document that matches ``data['record']['_id']``."""
        writer = IndexWriter(
            self.d, self.conf)
        try:
            doc = self.buildDocument(data['fields'], data['record'])
            # NOTE(review): buildDocument stores the identifier in the "id"
            # field, while this term targets "_id". Unless "_id" is also
            # listed in data['fields'], nothing matches and the document is
            # only added. Confirm which field name is intended.
            writer.updateDocument(
                lucene.Term("_id", data['record']['_id']), doc)

            # IndexWriter.optimize() does not exist in the Lucene 4.x API
            # this class targets; the call raised before close() was
            # reached. Commit explicitly, like index() does.
            writer.commit()
        finally:
            writer.close()

    def removeindex(self, data):
        """Delete the documents that match ``data['record']['_id']``."""
        writer = IndexWriter(
            self.d, self.conf)
        try:
            # NOTE(review): same "_id" vs "id" field-name question as in
            # updateindex; confirm the intended field.
            writer.deleteDocuments(
                lucene.Term("_id", data['record']['_id']))

            # See updateindex: optimize() is not available, commit instead.
            writer.commit()
        finally:
            writer.close()

    def query(self, data):
        """Run ``data['query']`` against the index and collect the hits.

        The query string is parsed with ``"id"`` as the default field and at
        most 100000 hits are requested.

        Returns:
            ``None`` when the index directory does not exist. Otherwise a
            dict with ``'totalHits'`` and ``'hits'``; ``'hits'`` maps each
            document id to a dict holding the hit ``'score'`` plus the
            string value of every stored field other than ``"id"``.
        """
        if not self.fil.exists():
            return None

        reader = DirectoryReader.open(self.d)
        try:
            searcher = IndexSearcher(reader)
            query = QueryParser(
                Version.LUCENE_30,
                "id",
                self.analyzer).parse(
                data['query'])
            hits = searcher.search(query, 100000)

            results = {'totalHits': hits.totalHits, 'hits': {}}

            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                record = {'score': hit.score}
                for field in doc.getFields():
                    if field.name() != "id":
                        record[field.name()] = field.stringValue()
                results['hits'][doc.get('id')] = record

            return results
        finally:
            # Close the reader even when parsing or searching fails.
            reader.close()

Exemplo n.º 7

0

Exibir arquivo

                        help='qa data for evaluation',
                        default='/home/xwhan/data/nq/nq-dev.txt')
    parser.add_argument('--topk', type=int, default=500)
    args = parser.parse_args()

    qas = [json.loads(line) for line in open(args.qa_data).readlines()][:1000]
    questions = [
        _["question"][:-1] if _["question"].endswith("?") else _["question"]
        for _ in qas
    ]
    answers = [item["answer"] for item in qas]

    print("Loading Lucene Index ...")
    lucene.initVM(vmargs=['-Djava.aws.headless=true'])
    analyzer = StandardAnalyzer()
    searchDir = NIOFSDirectory(Paths.get(args.index_path))
    searcher = IndexSearcher(DirectoryReader.open(searchDir))

    # try tuning the hyperparameters of bm25
    for k1 in [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]:
        for b in [0.5, 0.6, 0.7, 0.8, 0.9]:

            print(f"Grid search.... k1: {k1}; b: {b}")

            searcher.setSimilarity(BM25Similarity(k1, b))

            parser = QueryParser('Context', analyzer)

            retrieved = []
            print("Searching ...")
            for q in tqdm(questions):