Exemplo n.º 1
0
    def __init__(self, index_store_path):
        """Open an index writer on the directory at ``index_store_path``.

        The writer uses a ``StandardAnalyzer`` and the ``CREATE_OR_APPEND``
        open mode, and is kept on ``self.writer``.
        """
        index_dir = NIOFSDirectory(Paths.get(index_store_path))

        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        self.writer = IndexWriter(index_dir, writer_config)
Exemplo n.º 2
0
    def run(self):
        """Boot the JVM and serve queued Lucene requests forever.

        Each queue item is read as ``(key, request)``. ``request['action']``
        names the method of this object to call and ``request['data']`` is
        passed to it; ``request['data']['indexdir']`` is the index directory
        opened for the duration of the call. The method's return value
        (``{}`` when it returns ``None`` or raises) is stored in
        ``self.ret[key]``.
        """
        print("Booting lucene driver worker....")
        lucene.initVM()

        # Indexed and tokenized, but not stored.
        self.fieldType1 = FieldType()
        self.fieldType1.setIndexed(True)
        self.fieldType1.setStored(False)
        self.fieldType1.setTokenized(True)

        # Indexed and stored, but not tokenized.
        self.fieldType2 = FieldType()
        self.fieldType2.setIndexed(True)
        self.fieldType2.setStored(True)
        self.fieldType2.setTokenized(False)

        while True:
            data = self.queue.get()
            da = data[1]
            response = None
            try:
                self.fil = File(da['data']['indexdir'])
                self.d = NIOFSDirectory(self.fil)
                try:
                    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
                    self.conf = IndexWriterConfig(
                        Version.LUCENE_CURRENT,
                        self.analyzer)

                    response = getattr(self, da['action'])(da['data'])
                finally:
                    # Close the directory even when the action raises, so a
                    # failed request does not leak it.
                    self.d.close()
            except Exception as e:
                # Best effort: report the failure and answer with {} so the
                # worker keeps serving later requests.
                print(e)
            if response is None:
                response = {}

            self.ret[data[0]] = response
Exemplo n.º 3
0
    def __init__(self, corpusPath, storeDir):
        """Build a fresh index of ``corpusPath`` inside ``storeDir``.

        ``storeDir`` is created when missing, including any missing parent
        directories. The writer is opened in ``CREATE`` mode, the documents
        are added through ``self.indexDocs``, and the index is then committed
        and closed while a ``Ticker`` runs on a background thread.

        If indexing, committing or closing raises, the writer is rolled back
        (which also closes it) and the ticker's ``tick`` flag is cleared
        before the exception propagates.
        """
        if not os.path.exists(storeDir):
            # makedirs also creates missing parents, which mkdir would not.
            os.makedirs(storeDir)

        store = NIOFSDirectory(Paths.get(storeDir))
        analyzer = StandardAnalyzer()
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        ticker = None
        try:
            self.indexDocs(corpusPath, writer)
            ticker = Ticker()
            print('commit index')
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
        except Exception:
            # Do not leave the writer (and its write lock) open, and do not
            # publish a half-built index.
            writer.rollback()
            raise
        finally:
            # Always clear the flag so the ticker thread is not left running
            # after a failure.
            if ticker is not None:
                ticker.tick = False
        print('done')
Exemplo n.º 4
0
        i = doc.getValues('bestbets')
        self.bets = []
        if i:
            self.bets = i[0].split()
            
def do_query(property, qstring, limit=10):
    """Search field ``property`` for any of the analyzed terms of ``qstring``.

    ``qstring`` is run through the module-level ``analyzer``; every token it
    produces becomes an optional (``Occur.SHOULD``) ``TermQuery`` on
    ``property``. The combined query is run on the module-level ``searcher``.

    Returns a list with one ``Document(...)`` wrapper per hit, at most
    ``limit`` entries, in the order the searcher reports them.
    """
    query = BooleanQuery()
    stream = analyzer.tokenStream(property, StringReader(qstring))
    try:
        stream.reset()
        attr = stream.getAttribute(CharTermAttribute)

        while stream.incrementToken():
            term = attr.toString()
            query.add(TermQuery(Term(property, term)), Occur.SHOULD)

        # The TokenStream consumer workflow requires end() after the last
        # token and close() when done; analyzers may reuse the stream.
        stream.end()
    finally:
        stream.close()

    hits = searcher.search(query, None, limit).scoreDocs
    return [Document(searcher.doc(hit.doc)) for hit in hits]

# Module-level search state, created when this module is loaded:
# `analyzer` and `searcher` are the globals that do_query() reads.
path = 'lucene-ix'
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# Open the on-disk index under `path` and wrap it in a reader/searcher pair.
directory = NIOFSDirectory.open(File(path))
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(reader)

if __name__ == '__main__':
    # All command-line arguments together form one space-separated query
    # against the 'indicators' field; print the title of every match.
    cli_terms = sys.argv[1:]
    for match in do_query('indicators', ' '.join(cli_terms)):
        print(match.title)
Exemplo n.º 5
0
Must be run with Jython.
'''

import movielib

from java.io import File
from org.apache.lucene.util import Version
from org.apache.lucene.store import NIOFSDirectory
from org.apache.lucene.index import IndexWriterConfig, IndexWriter
from org.apache.lucene.document import Field, Document
from org.apache.lucene.analysis.standard import StandardAnalyzer

# --- OPEN SEARCH INDEX
# Open an index writer (`iwriter`) on the "lucene-ix" directory using a
# StandardAnalyzer.
path = "lucene-ix"
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
directory = NIOFSDirectory.open(File(path))
cfg = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
# NOTE(review): CREATE mode presumably replaces any index already present in
# that directory — confirm this is intended before running on real data.
cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iwriter = IndexWriter(directory, cfg)

# --- START INDEXING
# Build `movies`: movie id -> (title, cats), from movielib.get_movies().
print 'Loading metadata'
movies = {}
for (movieid, title, cats) in movielib.get_movies():
    movies[movieid] = (title, cats)

# Read best-bets.txt line by line; each line is split on whitespace and its
# first column is taken as the id.
# NOTE(review): `bets` is not filled in by the lines visible here — the loop
# body may continue beyond this excerpt. The file handle is also never closed
# explicitly.
print 'Loading best bets'
bets = {}
for line in open('best-bets.txt'):
    row = line.split()
    id = row[0]
Exemplo n.º 6
0
class Worker(Process):

    """This class represents the lucene worker object, which will run
    concurrently with other workers, and contains the actual lucene logic."""

    def __init__(self, config, queue, ret):
        """Store the worker's configuration and its request/response channels.

        ``queue`` supplies the ``(key, request)`` items consumed by ``run``;
        ``ret`` is the mapping in which ``run`` stores each response under
        its key.
        """
        Process.__init__(self)
        self.config = config
        self.connection = None
        self.channel = None
        self.queue = queue
        self.ret = ret

    def run(self):
        """Boot the JVM and serve queued Lucene requests forever.

        Each queue item is read as ``(key, request)``. ``request['action']``
        names the method of this object to call and ``request['data']`` is
        passed to it; ``request['data']['indexdir']`` is the index directory
        opened for the duration of the call. The method's return value
        (``{}`` when it returns ``None`` or raises) is stored in
        ``self.ret[key]``.
        """
        print("Booting lucene driver worker....")
        lucene.initVM()

        # Indexed and tokenized, but not stored: used for regular fields.
        self.fieldType1 = FieldType()
        self.fieldType1.setIndexed(True)
        self.fieldType1.setStored(False)
        self.fieldType1.setTokenized(True)

        # Indexed and stored, but not tokenized: used for the "id" field.
        self.fieldType2 = FieldType()
        self.fieldType2.setIndexed(True)
        self.fieldType2.setStored(True)
        self.fieldType2.setTokenized(False)

        while True:
            data = self.queue.get()
            da = data[1]
            response = None
            try:
                self.fil = File(da['data']['indexdir'])
                self.d = NIOFSDirectory(self.fil)
                try:
                    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
                    self.conf = IndexWriterConfig(
                        Version.LUCENE_CURRENT,
                        self.analyzer)

                    # NOTE(review): the action name comes straight from the
                    # queue and may resolve to any attribute of this object;
                    # consider restricting it to the public index/query
                    # methods.
                    response = getattr(self, da['action'])(da['data'])
                finally:
                    # Close the directory even when the action raises, so a
                    # failed request does not leak it.
                    self.d.close()
            except Exception as e:
                # Best effort: report the failure and answer with {} so the
                # worker keeps serving later requests.
                print(e)
            if response is None:
                response = {}

            self.ret[data[0]] = response

    def _write(self, conf, operation):
        """Run ``operation(writer)`` on a new IndexWriter and commit it.

        On any failure the writer is rolled back, which discards the pending
        changes and closes the writer, so the index write lock is never left
        held; the exception is then re-raised.
        """
        writer = IndexWriter(self.d, conf)
        try:
            operation(writer)
            writer.commit()
        except Exception:
            writer.rollback()
            raise
        writer.close()

    def rebuildIndex(self, data):
        """Replace the index with one document per entry of data['records'].

        The writer is opened in ``CREATE`` mode; ``data['fields']`` lists the
        record fields to index (see ``buildDocument``).
        """
        conf = self.conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        def addAll(writer):
            for record in data['records']:
                writer.addDocument(
                    self.buildDocument(data['fields'], record))

        self._write(conf, addAll)

    def buildDocument(self, fields, record):
        """Return a Lucene Document built from ``record``.

        ``record['_id']`` is added as the stored, untokenized ``"id"`` field.
        Every name in ``fields`` is added as a tokenized, unstored field;
        values that are dicts are flattened through ``dictToFields``.
        """
        doc = Document()
        doc.add(
            Field("id",
                  record["_id"],
                  self.fieldType2))
        for field in fields:
            if isinstance(record[field], dict):
                self.dictToFields(doc, record[field])
            else:
                doc.add(
                    Field(field,
                          record[field],
                          self.fieldType1))

        return doc

    def dictToFields(self, doc, record):
        """Add every leaf value of the (possibly nested) dict to ``doc``.

        Each leaf is added under its own key only; the keys of enclosing
        dicts are not part of the field name.
        """
        for key in record:
            if isinstance(record[key], dict):
                self.dictToFields(doc, record[key])
            else:
                doc.add(
                    Field(key,
                          record[key],
                          self.fieldType1))

    def index(self, data):
        """Add data['record'] to the index as a new document."""
        doc = self.buildDocument(data['fields'], data['record'])
        self._write(self.conf, lambda writer: writer.addDocument(doc))

    def updateindex(self, data):
        """Replace the indexed document of data['record'] with a fresh one.

        The existing document is located through the untokenized ``"id"``
        field written by ``buildDocument``. The ``"_id"`` term used before
        only matched when the caller also listed ``_id`` in ``fields``.
        """
        # Term is not an attribute of the ``lucene`` module on the API level
        # this class uses (FieldType, DirectoryReader); import it from its
        # Java package instead.
        from org.apache.lucene.index import Term

        record = data['record']
        doc = self.buildDocument(data['fields'], record)
        key = Term("id", record['_id'])

        # IndexWriter.optimize() no longer exists on this API level; a plain
        # commit (done by _write) makes the update visible.
        self._write(self.conf, lambda writer: writer.updateDocument(key, doc))

    def removeindex(self, data):
        """Delete the indexed document of data['record'].

        The document is located through the untokenized ``"id"`` field
        written by ``buildDocument``.
        """
        from org.apache.lucene.index import Term

        key = Term("id", data['record']['_id'])
        self._write(self.conf, lambda writer: writer.deleteDocuments(key))

    def query(self, data):
        """Run the query string data['query'] against the index.

        The query is parsed with ``"id"`` as its default field. Returns
        ``None`` when the index directory does not exist; otherwise a dict
        with ``'totalHits'`` and ``'hits'``, where ``'hits'`` maps each
        document id to its score and its stored fields other than ``"id"``.
        """
        if not self.fil.exists():
            return None

        reader = DirectoryReader.open(self.d)
        try:
            searcher = IndexSearcher(reader)
            query = QueryParser(
                Version.LUCENE_30,
                "id",
                self.analyzer).parse(
                data['query'])
            hits = searcher.search(query, 100000)

            results = {}

            results['totalHits'] = hits.totalHits
            results['hits'] = {}

            for hit in hits.scoreDocs:
                record = {}
                doc = searcher.doc(hit.doc)
                fields = doc.getFields()
                record['score'] = hit.score
                for field in fields:
                    if field.name() != "id":
                        record[field.name()] = field.stringValue()
                results['hits'][doc.get('id')] = record

            return results
        finally:
            # Release the reader even when parsing or searching fails.
            reader.close()
Exemplo n.º 7
0
                        help='qa data for evaluation',
                        default='/home/xwhan/data/nq/nq-dev.txt')
    parser.add_argument('--topk', type=int, default=500)
    args = parser.parse_args()

    qas = [json.loads(line) for line in open(args.qa_data).readlines()][:1000]
    questions = [
        _["question"][:-1] if _["question"].endswith("?") else _["question"]
        for _ in qas
    ]
    answers = [item["answer"] for item in qas]

    print("Loading Lucene Index ...")
    lucene.initVM(vmargs=['-Djava.aws.headless=true'])
    analyzer = StandardAnalyzer()
    searchDir = NIOFSDirectory(Paths.get(args.index_path))
    searcher = IndexSearcher(DirectoryReader.open(searchDir))

    # try tuning the hyperparameters of bm25
    for k1 in [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]:
        for b in [0.5, 0.6, 0.7, 0.8, 0.9]:

            print(f"Grid search.... k1: {k1}; b: {b}")

            searcher.setSimilarity(BM25Similarity(k1, b))

            parser = QueryParser('Context', analyzer)

            retrieved = []
            print("Searching ...")
            for q in tqdm(questions):