def create_index(storage, paths):
    """Build a Lucene index over overlapping sentence windows from text files."""
    import os
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            # Index overlapping windows of up to ten sentences,
            # advancing three sentences at a time.
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES,
                              Field.Index.ANALYZED))
                writer.addDocument(doc)
            print "Done %s" % (path + filen)

    print "Indexed (%d docs in index)" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
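# Helpers assumed by create_index above: `stopwords` is a module-level word
# list (e.g. NLTK's English list), `sent_tokenize` is NLTK's sentence
# splitter, and `get_data_from_file` is a project helper that presumably
# returns a file's contents as unicode text. A minimal sketch under those
# assumptions, not the project's actual implementation:
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import sent_tokenize

stopwords = nltk_stopwords.words('english')


def get_data_from_file(file_path):
    # Hypothetical helper: read the whole file and decode it to unicode.
    with open(file_path) as fh:
        return fh.read().decode('utf-8')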
def wikipedia_indexer(storage, wikipedia_file):
    """Index a Wikipedia dump with one '<title>\\t<article text>' pair per line."""
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open(wikipedia_file)
    for i, line in enumerate(f):
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        # Skip disambiguation pages and malformed lines without article text.
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
    f.close()

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def irsolver(data_file, index):
    """Answer multiple-choice questions by retrieval score: the option whose
    combined query scores highest against the index is predicted."""
    from questions import get_input_data
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    pred = []
    mapp = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}

    idx, ques, ans = get_input_data(data_file)
    for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)):
        max_score = -1000000
        best_ans = 'A'
        for i, ai in enumerate(a):
            sc = query(q, ai, analyzer, searcher)
            print acm, i, sc
            if sc > max_score:
                max_score = sc
                best_ans = mapp[i + 1]
        pred.append(best_ans)
    return idx, pred
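# `query` is not defined in this file. A minimal sketch of what it might look
# like, assuming the index has an analyzed "text" field and that an option's
# score is the summed relevance of the top hits for the concatenated
# question + option string; the field name and top-10 cutoff are assumptions:
def query(q, ai, analyzer, searcher, topn=10):
    from org.apache.lucene.queryparser.classic import QueryParser
    parser = QueryParser(Version.LUCENE_4_10_1, "text", analyzer)
    # Escape Lucene query-syntax characters before parsing the raw text.
    lucene_query = parser.parse(QueryParser.escape(q + ' ' + ai))
    hits = searcher.search(lucene_query, topn).scoreDocs
    return sum(hit.score for hit in hits)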
def get_wiki_docids(data_file, wikipedia_index):
    """Retrieve Wikipedia document ids relevant to each question in data_file."""
    from questions import get_input_data
    data = get_input_data(data_file)

    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)
    generate_docids(data, data_file, analyzer, searcher)
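# `generate_docids` is defined elsewhere in the project. A rough sketch of the
# expected behaviour, assuming `data` unpacks like get_input_data's output in
# irsolver above; the field names, top-20 cutoff, and output format are all
# assumptions for illustration:
def generate_docids(data, data_file, analyzer, searcher, topn=20):
    from org.apache.lucene.queryparser.classic import QueryParser
    parser = QueryParser(Version.LUCENE_4_10_1, "text", analyzer)
    idx, ques, ans = data
    with open(data_file + '.docids', 'w') as out:
        for idq, q, a in zip(idx, ques, ans):
            for ai in a:
                lucene_query = parser.parse(QueryParser.escape(q + ' ' + ai))
                hits = searcher.search(lucene_query, topn).scoreDocs
                # "num" is the stored line number written by wikipedia_indexer.
                docids = [searcher.doc(h.doc).get("num") for h in hits]
                out.write('%s\t%s\n' % (idq, ' '.join(docids)))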
def internal_analyzer(self):
    """Build a StandardAnalyzer whose stopword set also filters Java keywords."""
    java_stopwords = [
        "public", "private", "protected", "interface", "abstract",
        "implements", "extends", "null", "new", "switch", "case", "default",
        "synchronized", "do", "if", "else", "break", "continue", "this",
        "assert", "for", "instanceof", "transient", "final", "static",
        "void", "catch", "try", "throws", "throw", "class", "finally",
        "return", "const", "native", "super", "while", "import", "package",
        "true", "false", "enum"
    ]
    # Start from Lucene's default English stopwords, then add Java keywords.
    all_stopwords = list(
        StandardAnalyzer(Version.LUCENE_CURRENT).getStopwordSet())
    all_stopwords.extend(java_stopwords)
    stopwords = CharArraySet(Version.LUCENE_CURRENT, all_stopwords, True)
    return StandardAnalyzer(Version.LUCENE_CURRENT, stopwords)
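# A quick sanity check of the combined stopword set, assuming the method can
# be invoked standalone (the `None` self argument and the sample string are
# purely illustrative): run a bit of Java-ish text through the analyzer and
# print the tokens that survive.
from java.io import StringReader
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

analyzer = internal_analyzer(None)
stream = analyzer.tokenStream("code", StringReader("public static void main"))
term = stream.addAttribute(CharTermAttribute.class_)
stream.reset()
while stream.incrementToken():
    print term.toString()  # only "main" survives; the keywords are filtered
stream.close()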
def create_index(index):
    """Index one question per line from the combined questions file,
    boosting the text field."""
    indexDir = SimpleFSDirectory(File(index))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open('f:/nlp/data/questions/combine.txt')
    for line in f:
        line = get_data_from_text(line.decode('utf-8'))
        doc = Document()
        field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
        field.setBoost(2.0)
        doc.add(field)
        writer.addDocument(doc)
    f.close()

    print "Indexed (%d docs in index)" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
import sys
import glob

import lucene
from java.io import File
from org.apache.lucene.analysis.es import SpanishAnalyzer
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print sys.argv[0] + ' <documentsDirectory> <stopWords> <indexDirectory>'
        sys.exit()
    documentsDirectory = sys.argv[1]
    stopWords = sys.argv[2]
    indexDirectory = sys.argv[3]

    lucene.initVM()

    # Read one stopword per line into a case-insensitive exclusion set.
    exclusionSet = CharArraySet(0, True)
    f = open(stopWords, 'r')
    for line in f:
        exclusionSet.add(line.strip())
    f.close()

    indexDir = SimpleFSDirectory(File(indexDirectory))
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT,
                                     SpanishAnalyzer(exclusionSet))
    writer = IndexWriter(indexDir, writerConfig)
    totalDocs = len(glob.glob(documentsDirectory + '/*.xml'))