Example #1
def wikipedia_indexer(storage, wikipedia_file) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open(wikipedia_file)

	for i, line in enumerate(f) :
		text = line.strip().decode('utf-8').split('\t')
		title = text[0]
		if 'disambigu' in text[0] or len(text) < 2:
			continue
		text = text[1]
		doc = Document()
		doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
		doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
		doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
		writer.addDocument(doc)
		if writer.numDocs() % 1000 == 0 :
			print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
		
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()	
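The indexer above expects a tab-separated Wikipedia dump (title, a tab, then the article text on each line) and skips disambiguation pages. Like Examples #2, #4, #6 and #7 it also assumes PyLucene imports and a module-level stopwords list that are not shown; a plausible preamble for PyLucene 4.10 is sketched below, with a placeholder stop-word list.

# Hypothetical preamble for the Lucene 4.10 snippets on this page; the actual
# stop-word list used by the original projects is not shown, so this one is
# only a placeholder.
import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

stopwords = ['a', 'an', 'and', 'in', 'is', 'of', 'the', 'to']  # placeholder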
Example #2
def irsolver(data_file, index) :
	from questions import get_input_data
	lucene.initVM()
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	reader = IndexReader.open(SimpleFSDirectory(File(index)))
	searcher = IndexSearcher(reader)
	pred = []
	mapp = { 1 : 'A', 2 : 'B', 3 : 'C', 4 : 'D'}

	idx, ques, ans = get_input_data(data_file)
	for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)) :
		max_score = -1000000
		best_ans = 'A'
		for i, ai in enumerate(a):
			sc = query(q, ai, analyzer, searcher)
			print(acm, i, sc)
			if sc > max_score :
				max_score = sc
				best_ans = mapp[i+1]
		pred.append(best_ans)

	return idx, pred
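The query() helper called in Example #2 is not part of the snippet. Judging from the analyzer/searcher pattern in Example #3, a minimal sketch could parse the question together with a candidate answer against the indexed "text" field and return the best hit's score; the version below is an assumption, not the original implementation.

# Hypothetical stand-in for the query() helper used in Example #2.
from org.apache.lucene.queryparser.classic import QueryParser

def query(question, answer, analyzer, searcher):
	parser = QueryParser(Version.LUCENE_4_10_1, "text", analyzer)
	q = parser.parse(QueryParser.escape(question + ' ' + answer))
	hits = searcher.search(q, 1).scoreDocs
	return hits[0].score if hits else 0.0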
Example #3
  def getTopDocumentsWithExpansion(self, query, expTerms, limit, sfield, dfield):
    print expTerms
    query = query + ' ' + ' '.join('{0}^{1}'.format(x[0], round(x[1], 2))
                                   for x in expTerms)
    sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True)
    for entry in expTerms:
      sSet.add(entry[0])

    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT, self.stopSet, sSet)

    queryObj = QueryParser(Version.LUCENE_CURRENT, sfield,
                           analyzer).parse(query)
    scoreDocs = self.searcher.search(queryObj, limit).scoreDocs
    print '%s total matching documents.' % len(scoreDocs), queryObj
    self.results = scoreDocs
    rresults = []
    i = 0

    for scoreDoc in scoreDocs:
      doc = self.searcher.doc(scoreDoc.doc)
      #rresults.append(doc.get(dfield));#,scoreDoc.score))
      rresults.append((doc.get(dfield), scoreDoc.score))

      i += 1
      if i == limit:
        break
    return rresults
Example #4
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
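Example #4 indexes overlapping windows of up to ten sentences centered on every third sentence of each file. It relies on two helpers that are not shown: sent_tokenize, which matches NLTK's sentence splitter, and a project-specific get_data_from_file; the stand-in below is only an assumption.

# sent_tokenize presumably comes from NLTK.
from nltk.tokenize import sent_tokenize

def get_data_from_file(path):
	# Hypothetical stand-in: read the whole file and decode it as UTF-8,
	# matching the Python 2 style of the surrounding examples.
	with open(path) as fh:
		return fh.read().decode('utf-8')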
Example #5
  def initializeAnalyzer(self):
    #self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT,JavaSet(stopSet))
    sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True)
    for entry in stopSet:
      sSet.add(entry)
    self.stopSet = sSet
    #self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT,sSet)
    self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
Example #6
def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data

    data = get_input_data(data_file)

    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)

    generate_docids(data, data_file, analyzer, searcher)
Example #7
def create_index(index) :
	indexDir = SimpleFSDirectory(File(index))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open('f:/nlp/data/questions/combine.txt')
	for line in f :
		line = get_data_from_text(line.decode('utf-8'))
		doc = Document()
		field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
		field.setBoost(2.0)
		doc.add(field)
		writer.addDocument(doc)
	
	print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Example #8
    def internal_analyzer(self):
        java_stopwords = [
            "public", "private", "protected", "interface", "abstract",
            "implements", "extends", "null", "new", "switch", "case",
            "default", "synchronized", "do", "if", "else", "break", "continue",
            "this", "assert", "for", "instanceof", "transient", "final",
            "static", "void", "catch", "try", "throws", "throw", "class",
            "finally", "return", "const", "native", "super", "while", "import",
            "package", "true", "false", "enum"
        ]

        all_stopwords = list(
            StandardAnalyzer(Version.LUCENE_CURRENT).getStopwordSet())
        all_stopwords.extend(java_stopwords)

        stopwords = CharArraySet(Version.LUCENE_CURRENT, all_stopwords, True)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT, stopwords)
        #analyzer = KeywordAnalyzer()
        return analyzer
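The analyzer from Example #8 folds Java keywords into Lucene's default English stop-word set, which is handy when tokenizing source code or code-related text. As an illustration (not part of the original), the standard TokenStream pattern below shows which terms survive analysis; the field name "code" is arbitrary.

# Illustrative use of the analyzer returned above.
from java.io import StringReader
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

def tokenize(analyzer, text):
    stream = analyzer.tokenStream("code", StringReader(text))  # field name is arbitrary
    term = stream.addAttribute(CharTermAttribute.class_)
    stream.reset()
    tokens = []
    while stream.incrementToken():
        tokens.append(term.toString())
    stream.end()
    stream.close()
    return tokens

# e.g. tokenize(analyzer, "public static void main") should keep only "main"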
Example #9
def main(indexDir, inputDir):
	"""Creates a Lucene Index, and indexes every .json file it finds.
	It utilizes a stopwords.txt to filter out stop words"""
	lucene.initVM()

	logger.info("Loading stop words from stopwords.txt")
	f = open('stopwords.txt', 'r')
	stopwords = set([])
	for line in f:
		stopwords.add(line.strip())
	f.close()
	logger.debug('Stop words: %s' % str(stopwords))
	temp = CharArraySet(Version.LUCENE_CURRENT, 1, True)

	for stopword in stopwords:
		temp.add(stopword)

	stopwords = temp

	# Create index
	logger.info("Creating Lucene index [%s]..." % indexDir)

	dir = SimpleFSDirectory(File(indexDir))
	analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords)
	writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
	writer = IndexWriter(dir, writerConfig)

	logger.info("Currently there are %d documents in the index..." % writer.numDocs())

	# Index documents
	onlyfiles = [ f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json') ]
	for f in onlyfiles:
		try:
			journal_code = f.split('.')[0]
			f = join(inputDir, f)
			json_data = open(f)
			data = json.load(json_data)
			for entry in data:
				doc = Document()
				doc.add(Field("journal", journal_code, Field.Store.YES, Field.Index.NOT_ANALYZED))
				doc.add(Field("url", entry['url'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
				doc.add(Field("date", entry['date'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
				doc.add(Field("title", entry['title'], Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			json_data.close()
		except (IOError) as v:
			try:
				(code, message) = v
			except:
				code = 0
				message = v
			logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
	logger.info("Indexed lines from stdin (%d documents in index)" % writer.numDocs())

	# Wrap it up
	#logger.info("About to optimize index of %d documents..." % writer.numDocs())
	#writer.optimize()
	#logger.info("...done optimizing index of %d documents" % writer.numDocs())

	logger.info("Closing index of %d documents..." % writer.numDocs())
	writer.close()

	reader = IndexReader.open(dir)
	with open('all.csv', 'wb') as csvfile:
		csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
		for i in xrange(0, reader.numDocs()):
			doc = reader.document(i)
			csvwriter.writerow([doc.get('journal'), doc.get('date'), doc.get('url').encode('utf8'), \
				doc.get('title').strip().replace(',', '\,').encode('utf8')])
Example #10
from org.apache.lucene.analysis.util import CharArraySet

if __name__ == "__main__":

    if len(sys.argv) != 4:

        print sys.argv[0] + ' <documentsDirectory> <stopWords> <indexDirectory>'
        exit()

    documentsDirectory = sys.argv[1]
    stopWords = sys.argv[2]
    indexDirectory = sys.argv[3]

    lucene.initVM()

    exclusionSet = CharArraySet(0, True)
    f = open(stopWords, 'r')
    while 1:
        line = f.readline()
        if not line: break
        exclusionSet.add(line.strip())
    f.close()

    indexDir = SimpleFSDirectory(File(indexDirectory))

    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT,
                                     SpanishAnalyzer(exclusionSet))

    writer = IndexWriter(indexDir, writerConfig)

    totalDocs = len(glob.glob(documentsDirectory + '/*.xml'))