# PyLucene / Lucene 4.10 imports assumed by this snippet and the other
# Version.LUCENE_4_10_1 snippets below; `stopwords` is a module-level
# list of stop words defined elsewhere in the original project.
import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version


def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open(wikipedia_file)
    for i, line in enumerate(f):
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        # Skip disambiguation pages and lines without a text column.
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def irsolver(data_file, index):
    from questions import get_input_data
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)

    pred = []
    mapp = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}
    idx, ques, ans = get_input_data(data_file)
    for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)):
        # Score each candidate answer against the index; keep the best one.
        max_score = -1000000
        best_ans = 'A'
        for i, ai in enumerate(a):
            sc = query(q, ai, analyzer, searcher)
            print acm, i, sc
            if sc > max_score:
                max_score = sc
                best_ans = mapp[i + 1]
        pred.append(best_ans)

    return idx, pred
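# --- Added sketch, not from the original source ---
# irsolver() above relies on a query() helper that is not included in this
# listing. The following is a minimal, hypothetical reconstruction: it
# assumes the index stores passages in a "text" field (as create_index()
# below does) and returns the Lucene score of the best hit, 0.0 when
# nothing matches. The real implementation may differ.
from org.apache.lucene.queryparser.classic import QueryParser

def query(question, answer, analyzer, searcher):
    # Combine question and candidate answer into one free-text query.
    raw = QueryParser.escape(question + ' ' + answer)
    q = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(raw)
    hits = searcher.search(q, 1).scoreDocs
    # Lucene score of the best-matching passage; 0.0 if there is none.
    return hits[0].score if len(hits) > 0 else 0.0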
# Requires: from org.apache.lucene.analysis.en import EnglishAnalyzer
def getTopDocumentsWithExpansion(self, query, expTerms, limit, sfield, dfield):
    print expTerms
    # Append the expansion terms to the query string, boosted by their
    # (rounded) weights, e.g. "term^0.75".
    query = query + ' ' + ' '.join('{0}^{1}'.format(x[0], round(x[1], 2))
                                   for x in expTerms)
    # Keep the expansion terms out of stemming via a stem-exclusion set.
    sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True)
    for entry in expTerms:
        sSet.add(entry[0])
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT, self.stopSet, sSet)
    queryObj = QueryParser(Version.LUCENE_CURRENT, sfield, analyzer).parse(query)
    scoreDocs = self.searcher.search(queryObj, limit).scoreDocs
    print '%s total matching documents.' % len(scoreDocs), queryObj

    self.results = scoreDocs
    rresults = []
    i = 0
    for scoreDoc in scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        rresults.append((doc.get(dfield), scoreDoc.score))
        i += 1
        if i == limit:
            break
    return rresults
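# --- Added usage sketch, not from the original source ---
# getTopDocumentsWithExpansion() consumes expTerms as (term, weight)
# pairs; a hypothetical call, with placeholder instance, query, and
# field names, might look like:
#
#     results = engine.getTopDocumentsWithExpansion(
#         "heat transfer",
#         [("conduction", 0.8), ("convection", 0.55)],  # (term, weight)
#         10, "text", "docid")
#     for doc_id, score in results:
#         print doc_id, score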
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            # sent_tokenize comes from NLTK; get_data_from_file is
            # defined elsewhere in the original project.
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            # Index overlapping windows of up to 10 sentences,
            # stepping 3 sentences at a time.
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES,
                              Field.Index.ANALYZED))
                writer.addDocument(doc)
            print "Done %s" % (path + filen)

    print "Indexed (%d docs in index)" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def initializeAnalyzer(self):
    # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT, JavaSet(stopSet))
    sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True)
    for entry in stopSet:
        sSet.add(entry)
    # The stop set is kept on self for later use (see
    # getTopDocumentsWithExpansion above), but the analyzer itself is
    # built with EnglishAnalyzer's default stop words.
    self.stopSet = sSet
    # self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT, sSet)
    self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data
    data = get_input_data(data_file)

    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)
    # generate_docids is defined elsewhere in the original project.
    generate_docids(data, data_file, analyzer, searcher)
def create_index(index):
    indexDir = SimpleFSDirectory(File(index))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open('f:/nlp/data/questions/combine.txt')
    for line in f:
        line = get_data_from_text(line.decode('utf-8'))
        doc = Document()
        field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
        field.setBoost(2.0)  # index-time boost on the text field
        doc.add(field)
        writer.addDocument(doc)

    print "Indexed (%d docs in index)" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def internal_analyzer(self):
    # Treat Java keywords as stop words, in addition to Lucene's
    # default English stop set, when analyzing source code.
    java_stopwords = [
        "public", "private", "protected", "interface", "abstract",
        "implements", "extends", "null", "new", "switch", "case",
        "default", "synchronized", "do", "if", "else", "break",
        "continue", "this", "assert", "for", "instanceof", "transient",
        "final", "static", "void", "catch", "try", "throws", "throw",
        "class", "finally", "return", "const", "native", "super",
        "while", "import", "package", "true", "false", "enum"
    ]
    all_stopwords = list(
        StandardAnalyzer(Version.LUCENE_CURRENT).getStopwordSet())
    all_stopwords.extend(java_stopwords)
    stopwords = CharArraySet(Version.LUCENE_CURRENT, all_stopwords, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT, stopwords)
    # analyzer = KeywordAnalyzer()
    return analyzer
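# --- Added usage sketch, not from the original source ---
# An analyzer built by internal_analyzer() would typically feed an
# IndexWriterConfig, following the same pattern as the other snippets in
# this listing; the `indexer` instance and index path are placeholders.
#
#     config = IndexWriterConfig(Version.LUCENE_CURRENT,
#                                indexer.internal_analyzer())
#     writer = IndexWriter(SimpleFSDirectory(File("/tmp/code-index")), config)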
def main(indexDir, inputDir):
    """Creates a Lucene index and indexes every .json file it finds,
    using a stopwords.txt file to filter out stop words."""
    lucene.initVM()

    logger.info("Loading stop words from stopwords.txt")
    f = open('stopwords.txt', 'r')
    stopwords = set()
    for line in f:
        stopwords.add(line.strip())
    f.close()
    logger.debug('Stop words: %s' % str(stopwords))
    temp = CharArraySet(Version.LUCENE_CURRENT, 1, True)
    for stopword in stopwords:
        temp.add(stopword)
    stopwords = temp

    # Create index
    logger.info("Creating Lucene index [%s]..." % indexDir)
    store = SimpleFSDirectory(File(indexDir))  # renamed from `dir` (builtin)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords)
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer = IndexWriter(store, writerConfig)
    logger.info("Currently there are %d documents in the index..."
                % writer.numDocs())

    # Index documents
    onlyfiles = [f for f in listdir(inputDir)
                 if isfile(join(inputDir, f)) and f.endswith('.json')]
    for f in onlyfiles:
        try:
            journal_code = f.split('.')[0]
            f = join(inputDir, f)
            json_data = open(f)
            data = json.load(json_data)
            for entry in data:
                doc = Document()
                doc.add(Field("journal", journal_code, Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                doc.add(Field("url", entry['url'], Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                doc.add(Field("date", entry['date'], Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                doc.add(Field("title", entry['title'], Field.Store.YES,
                              Field.Index.ANALYZED))
                writer.addDocument(doc)
            json_data.close()
        except IOError as v:
            try:
                (code, message) = v
            except:
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Indexed JSON files (%d documents in index)" % writer.numDocs())

    # Wrap it up
    # logger.info("About to optimize index of %d documents..." % writer.numDocs())
    # writer.optimize()
    # logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    # Dump the whole index to all.csv.
    reader = IndexReader.open(store)
    with open('all.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_ALL)
        for i in xrange(0, reader.numDocs()):
            doc = reader.document(i)
            csvwriter.writerow([doc.get('journal'), doc.get('date'),
                                doc.get('url').encode('utf8'),
                                doc.get('title').strip()
                                   .replace(',', '\,').encode('utf8')])
# Standalone indexing script (Spanish corpus). Imports beyond CharArraySet
# were missing from the original snippet and are reconstructed here.
import sys
import glob

import lucene
from java.io import File
from org.apache.lucene.analysis.es import SpanishAnalyzer
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print sys.argv[0] + ' <documentsDirectory> <stopWords> <indexDirectory>'
        exit()

    documentsDirectory = sys.argv[1]
    stopWords = sys.argv[2]
    indexDirectory = sys.argv[3]

    lucene.initVM()

    # Load the stop-word file into a case-insensitive CharArraySet.
    exclusionSet = CharArraySet(0, True)
    f = open(stopWords, 'r')
    for line in f:
        exclusionSet.add(line.strip())
    f.close()

    indexDir = SimpleFSDirectory(File(indexDirectory))
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT,
                                     SpanishAnalyzer(exclusionSet))
    writer = IndexWriter(indexDir, writerConfig)

    totalDocs = len(glob.glob(documentsDirectory + '/*.xml'))