import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.index import DirectoryReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import RAMDirectory, SimpleFSDirectory
from fuzzywuzzy import process


def retrival_answer(MAX):
    lucene.initVM()
    # Build the index in RAM; the on-disk directory is created but never used.
    directory = RAMDirectory()
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    # Index every line of the document file as a separate Lucene document.
    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed %d docs in index" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        searcher = IndexSearcher(DirectoryReader.open(directory))
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)

        # Collect the text of every retrieved document for this query.
        candidate_doc = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            candidate_doc.append(doc.get("text"))

        # Fuzzy-match the ground-truth line against the candidates; a
        # similarity score of 89 or higher counts as a correct retrieval.
        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        accuracy.append(flag)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))
    print "the final accuracy is:", final_accuracy
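# A minimal sketch of the fuzzy-matching step above, assuming the
# fuzzywuzzy package: process.extract returns (choice, score) pairs
# scored 0-100, so the >= 89 threshold counts only near-exact matches
# as correct retrievals. The strings below are made up for illustration.
from fuzzywuzzy import process

candidates = ["the quick brown fox", "a slow red fox", "lorem ipsum"]
matches = process.extract("quick brown fox", candidates)
# matches is a list like [("the quick brown fox", 95), ("a slow red fox", 60), ...]
hit = any(score >= 89 for _, score in matches)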
import csv
import json
import logging
from os import listdir
from os.path import isfile, join

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis import CharArraySet
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.index import DirectoryReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory

logger = logging.getLogger(__name__)


def main(index_dir, input_dir):
    """Creates a Lucene index and indexes every .json file it finds.
    Uses a stopwords.txt file to filter out stop words."""
    lucene.initVM()

    logger.info("Loading stop words from stopwords.txt")
    f = open('stopwords.txt', 'r')
    stopwords = set()
    for line in f:
        stopwords.add(line.strip())
    f.close()
    logger.debug('Stop words: %s' % str(stopwords))
    # Copy the stop words into the CharArraySet the analyzer expects.
    temp = CharArraySet(1, True)
    for stopword in stopwords:
        temp.add(stopword)
    stopwords = temp

    # Create the index.
    logger.info("Creating Lucene index [%s]..." % index_dir)
    fs_dir = SimpleFSDirectory(Paths.get(index_dir))
    analyzer = StandardAnalyzer(stopwords)
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(fs_dir, writerConfig)
    logger.info("Currently there are %d documents in the index..." % writer.numDocs())

    # Index one document per JSON entry: exact-match fields are StringFields,
    # the title is a TextField so it gets analyzed for full-text search.
    onlyfiles = [
        f for f in listdir(input_dir)
        if isfile(join(input_dir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        try:
            journal_code = f.split('.')[0]
            f = join(input_dir, f)
            json_data = open(f)
            data = json.load(json_data)
            for entry in data:
                doc = Document()
                doc.add(StringField("journal", journal_code, Field.Store.YES))
                doc.add(StringField("url", entry['url'], Field.Store.YES))
                doc.add(StringField("date", entry['date'], Field.Store.YES))
                doc.add(TextField("title", entry['title'], Field.Store.YES))
                writer.addDocument(doc)
            json_data.close()
        except IOError as v:
            try:
                (code, message) = v
            except (TypeError, ValueError):
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Done indexing (%d documents in index)" % writer.numDocs())

    # Wrap it up.
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    # Dump the stored fields of every indexed document to all.csv.
    reader = DirectoryReader.open(fs_dir)
    with open('all.csv', 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_ALL)
        for i in range(0, reader.numDocs()):
            doc = reader.document(i)
            csvwriter.writerow([
                doc.get('journal'),
                doc.get('date'),
                doc.get('url'),
                doc.get('title').strip().replace(',', '\\,')
            ])
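# Hypothetical follow-up to main() above: a sketch of querying the
# analyzed "title" field of the freshly built index. The index path
# "index" and the query string "economics" are assumptions for
# illustration, not part of the original example.
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()
reader = DirectoryReader.open(SimpleFSDirectory(Paths.get("index")))
searcher = IndexSearcher(reader)
query = QueryParser("title", StandardAnalyzer()).parse("economics")
for hit in searcher.search(query, 10).scoreDocs:
    doc = searcher.doc(hit.doc)
    print(doc.get("journal"), doc.get("date"), doc.get("title"))
reader.close()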
# Same indexer as above, written against the older PyLucene 3.x API
# (Version constants, java.io.File paths, Field.Index flags).
import csv
import json
import logging
from os import listdir
from os.path import isfile, join

import lucene
# Assumes the flat PyLucene 3.x namespace, where the Java classes are
# exposed directly on the lucene module.
from lucene import (CharArraySet, Document, Field, File, IndexReader,
                    IndexWriter, IndexWriterConfig, SimpleFSDirectory,
                    StopAnalyzer, Version)

logger = logging.getLogger(__name__)


def main(indexDir, inputDir):
    """Creates a Lucene index and indexes every .json file it finds.
    Uses a stopwords.txt file to filter out stop words."""
    lucene.initVM()

    logger.info("Loading stop words from stopwords.txt")
    f = open('stopwords.txt', 'r')
    stopwords = set()
    for line in f:
        stopwords.add(line.strip())
    f.close()
    logger.debug('Stop words: %s' % str(stopwords))
    # Copy the stop words into the CharArraySet the analyzer expects.
    temp = CharArraySet(Version.LUCENE_CURRENT, 1, True)
    for stopword in stopwords:
        temp.add(stopword)
    stopwords = temp

    # Create the index.
    logger.info("Creating Lucene index [%s]..." % indexDir)
    store = SimpleFSDirectory(File(indexDir))
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords)
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer = IndexWriter(store, writerConfig)
    logger.info("Currently there are %d documents in the index..." % writer.numDocs())

    # Index documents: exact-match fields are NOT_ANALYZED, the title is
    # ANALYZED so it is tokenized for full-text search.
    onlyfiles = [
        f for f in listdir(inputDir)
        if isfile(join(inputDir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        try:
            journal_code = f.split('.')[0]
            f = join(inputDir, f)
            json_data = open(f)
            data = json.load(json_data)
            for entry in data:
                doc = Document()
                doc.add(Field("journal", journal_code, Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                doc.add(Field("url", entry['url'], Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                doc.add(Field("date", entry['date'], Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                doc.add(Field("title", entry['title'], Field.Store.YES,
                              Field.Index.ANALYZED))
                writer.addDocument(doc)
            json_data.close()
        except IOError as v:
            try:
                (code, message) = v
            except (TypeError, ValueError):
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Done indexing (%d documents in index)" % writer.numDocs())

    # Wrap it up.
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    # Dump the stored fields of every indexed document to all.csv.
    reader = IndexReader.open(store)
    with open('all.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_ALL)
        for i in xrange(0, reader.numDocs()):
            doc = reader.document(i)
            csvwriter.writerow([
                doc.get('journal'),
                doc.get('date'),
                doc.get('url').encode('utf8'),
                doc.get('title').strip().replace(',', '\\,').encode('utf8')
            ])
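# For reference, the legacy calls above map onto the modern (Lucene 6+)
# API used in the earlier example roughly as follows:
#   SimpleFSDirectory(File(path))        -> SimpleFSDirectory(Paths.get(path))
#   IndexWriterConfig(Version, analyzer) -> IndexWriterConfig(analyzer)
#   Field(name, value, Field.Store.YES,
#         Field.Index.NOT_ANALYZED)      -> StringField(name, value, Field.Store.YES)
#   Field(..., Field.Index.ANALYZED)     -> TextField(name, value, Field.Store.YES)
#   IndexReader.open(directory)          -> DirectoryReader.open(directory)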
# This snippet picks up mid-script: addDoc, r, writer, and store are
# defined earlier in the original file and are not shown here.
from flask import Flask, render_template, request
from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.search import FuzzyQuery, IndexSearcher

# Create a field for each parsed value and add those documents to the index.
addDoc("name", r['name'], writer)
addDoc("research", r['research'], writer)
writer.commit()
writer.close()

# At this point the terms are searched for within the index.
searcher = IndexSearcher(DirectoryReader.open(store))
query = FuzzyQuery(Term("research", "programaçao"))
MAX = 1000
hits = searcher.search(query, MAX)

professorList = []
for hit in hits.scoreDocs:
    doc = searcher.doc(hit.doc)
    professorList.append(doc.get("name"))

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('home.html')


@app.route('/contents', methods=['POST'])
def contents():
    term = request.form['content']
    return render_template('contents.html', term=term)
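# A note on the FuzzyQuery used above: it matches terms within a
# Levenshtein edit distance of the query term (2 by default), which is
# why the misspelled "programaçao" can still hit entries containing
# "programacao". The stricter variant below is a sketch; the second
# constructor argument is the maximum number of edits allowed.
from org.apache.lucene.index import Term
from org.apache.lucene.search import FuzzyQuery

loose = FuzzyQuery(Term("research", "programaçao"))      # up to 2 edits
strict = FuzzyQuery(Term("research", "programaçao"), 1)  # up to 1 edit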