def document_to_query(self, doc):
    """Given a document, transforms its source-code-related fields into a Lucene query string."""
    query = ""
    for field in ["description"]:
        for val in doc.getFields(field):
            if val.stringValue().strip():
                term = QueryParser.escape(val.stringValue())
                # tokenize
                term = self.tokenize_string(StandardAnalyzer(), term)
                # CamelCase
                temp = []
                for t in term:
                    temp += self.camel_case_split(t)
                # stopwords
                temp_2 = []
                for t in temp:
                    if t not in english_stop_words:
                        temp_2.append(t)
                # stemming
                temp_3 = []
                for t in temp_2:
                    temp_3.append(stem(t))
                # stopwords again, on the stemmed terms
                temp_4 = []
                for t in temp_3:
                    if t not in english_stop_words:
                        temp_4.append(t)
                # query generation
                for term in temp_4:
                    query += "%s:%s " % (field, term)

    for field in ["typed_method_call", "methods", "used_classes",
                  "class_instance_creation", "methods_called"]:  # "extends", "annotations", "literals"
        for val in doc.getFields(field):
            if val.stringValue().strip():
                term = QueryParser.escape(val.stringValue())
                stoplist = ["java.lang.Object"]
                if term not in stoplist:
                    query += "%s:%s " % (field, term)

    if len(doc.getFields("code_hints")) > 0:
        hints = [hint.stringValue() for hint in doc.getFields("code_hints")]
        hints_str = " ".join(hints)
        for term in hints:
            if term:
                term = QueryParser.escape(term)
                if term not in english_stop_words:
                    # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                    query += "code_hints:%s " % term
    return query
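# `document_to_query` calls two helpers, `tokenize_string` and
# `camel_case_split`, that are not shown in this snippet. A minimal sketch of
# what they might look like follows -- the names match the calls above, but
# the bodies are assumptions, not the original implementations.
import re

from java.io import StringReader
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute


def tokenize_string(self, analyzer, text):
    """Assumed helper: run `text` through `analyzer` and collect its tokens."""
    tokens = []
    stream = analyzer.tokenStream("dummy", StringReader(text))
    term_att = stream.addAttribute(CharTermAttribute.class_)
    stream.reset()
    while stream.incrementToken():
        tokens.append(term_att.toString())
    stream.close()
    return tokens


def camel_case_split(self, token):
    """Assumed helper: split camelCase/PascalCase identifiers into lowercase words."""
    parts = re.findall(r'[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+', token)
    return [p.lower() for p in parts] or [token]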
class SearchIndex(object):

    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)

    def search(self, q, page=1, duplicates=False):
        query = self.parser.parse(q)
        if not duplicates:
            query = self.addDuplicatesQuery(query)
        perPage = 10
        start = (page - 1) * perPage
        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)
        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")
            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight})
        del self.searcher
        totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))
        return totalPages, docs

    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
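# A minimal usage sketch for SearchIndex, assuming lucene.initVM() was called
# at process startup and app.config['INDEX_PATH'] points at an existing index.
# Note that search() deletes self.searcher, so each instance serves one query.
index = SearchIndex()
total_pages, docs = index.search("python lucene", page=1)
for d in docs:
    print(d['title'])  # d['url'], d['duplicate'], d['highlight'] also available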
def testOverrideBooleanQuery(self):

    class TestQueryParser(BooleanTestMixin, PythonQueryParser):
        def getFieldQuery_quoted(_self, field, queryText, quoted):
            return super(TestQueryParser, _self).getFieldQuery_quoted_super(
                field, queryText, quoted)

    qp = TestQueryParser(Version.LUCENE_CURRENT, 'all',
                         StandardAnalyzer(Version.LUCENE_CURRENT))
    q = qp.parse("foo bar")
    self.assertEquals(str(q), "all:foo all:bar all:extra_clause")
def __enter__(self):
    """Used by the "with" statement. Like an "open" / "init" method."""
    if lucene.getVMEnv() is None:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    index_path = Path(self.index_root_loc).joinpath('%s/' % self.index_subdir_name)
    index_path.mkdir(parents=True, exist_ok=True)
    store = SimpleFSDirectory(Paths.get(str(index_path)))
    self.analyzer = StandardAnalyzer()
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    # IndexWriter
    self.writer = IndexWriter(store, config)
    # IndexReader
    self.reader = DirectoryReader.open(self.writer)
    # IndexSearcher
    self.searcher = IndexSearcher(self.reader)
    return self
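# The matching __exit__ is not shown in the snippet above. A minimal sketch of
# a counterpart that releases the resources opened in __enter__ -- the cleanup
# order is an assumption, not the original implementation.
def __exit__(self, exc_type, exc_value, traceback):
    # Close the reader first (it was opened on the writer), then the writer.
    self.reader.close()
    self.writer.close()
    return False  # do not suppress exceptions raised inside the "with" block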
def __init__(self, indexDir):
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)
    store = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    self.writer = IndexWriter(store, config)
def __init__(self):
    self.env = lucene.initVM(initialheap='6g', maxheap='6g',
                             vmargs=['-Djava.awt.headless=true'])
    self.vocab = None
    BooleanQuery.setMaxClauseCount(2048)

    if not os.path.exists(prm.index_folder):
        print('Creating index at', prm.index_folder)
        if prm.docs_path == prm.docs_path_term:
            add_terms = True
        else:
            add_terms = False
        self.create_index(prm.index_folder, prm.docs_path, add_terms)

    if prm.local_index_folder:
        print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
        if os.path.exists(prm.local_index_folder):
            print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
        else:
            shutil.copytree(prm.index_folder, prm.local_index_folder)
        self.index_folder = prm.local_index_folder
    else:
        self.index_folder = prm.index_folder

    fsDir = MMapDirectory(Paths.get(prm.index_folder))
    self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
    self.searcher.setSimilarity(BM25Similarity())

    if prm.docs_path != prm.docs_path_term:
        if not os.path.exists(prm.index_folder_term):
            print('Creating index at', prm.index_folder_term)
            self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

        if prm.local_index_folder_term:
            print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
            if os.path.exists(prm.local_index_folder_term):
                print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
            self.index_folder_term = prm.local_index_folder_term
        else:
            self.index_folder_term = prm.index_folder_term

        fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
        self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

    self.analyzer = StandardAnalyzer()
    self.pool = ThreadPool(processes=prm.n_threads)
    self.cache = {}

    print('Loading Text-ID mapping...')
    self.text_id_map, self.id_text_map = self.get_text_id_map()
def __init__(self, path, topn=DEF_TOPN):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    self.path = path
    self.topn = topn
    self._analyzer = StandardAnalyzer()
    self._store = SimpleFSDirectory(Paths.get(os.path.abspath(self.path)))
    self._searcher = IndexSearcher(DirectoryReader.open(self._store))
    self.purpose_is_w2v = None
    self.purpose_is_not_w2v = None
    self.mechanics_is_w2v = None
    self.mechanics_is_not_w2v = None
def testThroughLayerException(self):

    class TestException(Exception):
        pass

    class TestQueryParser(PythonQueryParser):
        def getFieldQuery_quoted(_self, field, queryText, quoted):
            raise TestException("TestException")

    qp = TestQueryParser('all', StandardAnalyzer())
    with self.assertRaises(TestException):
        qp.parse("foo bar")
def init_index(self):
    """Initializes the Lucene index, a StandardAnalyzer, and an IndexSearcher.

    Sets the following attributes:
        vm: the initialized Java VM.
        analyzer: StandardAnalyzer word analyzer.
        searcher: searcher over the Lucene index.
    """
    self.vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    ldir = FSDirectory.open(Paths.get(settings.LUCENE_INDEX))
    self.analyzer = StandardAnalyzer()
    self.searcher = IndexSearcher(DirectoryReader.open(ldir))
def func_mid(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "mid", analyzer).parse(command)
    scoreDocs = searcher.search(query, 212).scoreDocs
    results = process(scoreDocs, searcher)
    return results
def __init__(self, docDir, indexDir, analyzer):
    # set index dir
    if not os.path.exists(indexDir):
        os.makedirs(indexDir)
    self.indexDir = SimpleFSDirectory(Paths.get(indexDir))
    self.docDir = docDir
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    writerConfig = IndexWriterConfig(self.analyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(self.indexDir, writerConfig)
    self.indexing()
def index(self):
    """
    This function is used to index the preprocessed data.
    The inverted index will be saved to the ./index/ folder.
    business_id, name, address, categories, review and tip data are indexed.
    """
    print "INDEXING..."
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    # each business indexed as a document
    for key, business in self.data.items():
        doc = Document()
        text = ""
        doc.add(Field("id", business["business_id"], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("name", business["name"], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("address", business["full_address"], Field.Store.YES, Field.Index.ANALYZED))
        cat_text = "\n".join(business["categories"])
        doc.add(Field("category", cat_text, Field.Store.YES, Field.Index.ANALYZED))

        # combine all reviews of a business together
        review_text = ""
        for review in business["review"]:
            review_text += review["text"]

        # combine all tips of a business together
        tip_text = ""
        for tip in business["tip"]:
            tip_text += tip["text"]

        # concatenate the data to be indexed and add it as one field
        text += business["name"]
        text += business["full_address"]
        text += cat_text
        text += review_text
        text += tip_text
        doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))

        # add the business doc to writer
        writer.addDocument(doc)

    writer.close()
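# A hedged sketch of querying the index built above. The directory and field
# names ("index/", "text", "name") match the indexing code; the query string
# and hit count are illustrative only.
searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(File("index/"))))
query = QueryParser(Version.LUCENE_4_10_1, "text", StandardAnalyzer()).parse("pizza")
for hit in searcher.search(query, 10).scoreDocs:
    print searcher.doc(hit.doc).get("name")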
def create_miniindex(docs):
    index_store = RAMDirectory()
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)
    for doc in docs:
        writer.addDocument(doc)
    writer.commit()
    writer.close()
    return index_store
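# A usage sketch for create_miniindex(): build two in-memory documents and
# search them. The field name "content" and the imports are assumptions.
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

docs = []
for text in ["hello world", "goodbye world"]:
    doc = Document()
    doc.add(Field("content", text, TextField.TYPE_STORED))
    docs.append(doc)

store = create_miniindex(docs)
searcher = IndexSearcher(DirectoryReader.open(store))
query = QueryParser("content", StandardAnalyzer()).parse("world")
print(searcher.search(query, 10).totalHits)  # both documents match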
def delete(indexDir: str, id: str):
    index_dir = SimpleFSDirectory(Paths.get(indexDir))
    config = IndexWriterConfig(StandardAnalyzer())
    index_writer = IndexWriter(index_dir, config)
    # match the exact id, plus any dotted children ("id.1", "id.2", ...)
    delete_term_query = RegexpQuery(Term('id', id))
    delete_reg_query = RegexpQuery(Term('id', id + r'\..*'))
    index_writer.deleteDocuments(delete_term_query)
    index_writer.deleteDocuments(delete_reg_query)
    index_writer.commit()
    index_writer.close()
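# Usage sketch: remove document "doc42" and any dotted children such as
# "doc42.1" from the index. The path and id are placeholders, not values from
# the original code.
delete("/path/to/index", "doc42")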
def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data
    data = get_input_data(data_file)

    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)

    generate_docids(data, data_file, analyzer, searcher)
def createIndex():
    print(lucene.VERSION)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    index = FSDirectory.open(Paths.get('index'))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index, config)
    openCSV('../output/outputSpark_full_index.csv', writer)
    writer.close()
def lucene_search(query, MAX, showHighlight):
    dir = os.getcwd()
    lucene.initVM()
    index_dir = SimpleFSDirectory(File(dir))
    index_reader = DirectoryReader.open(index_dir)
    lucene_searcher = IndexSearcher(index_reader)
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_48)
    my_query = QueryParser(Version.LUCENE_48, "text", lucene_analyzer).parse(query)

    # We can define the MAX number of results (default 10)
    total_hits = lucene_searcher.search(my_query, MAX)

    query_scorer = QueryScorer(my_query)
    formatter = SimpleHTMLFormatter()
    highlighter = Highlighter(formatter, query_scorer)
    # Set the fragment size: we break text into fragments of 50 characters
    fragmenter = SimpleSpanFragmenter(query_scorer, 50)
    highlighter.setTextFragmenter(fragmenter)

    print "Showing at most %s documents" % MAX
    if showHighlight:
        print "<br>"

    for hit in total_hits.scoreDocs:
        doc = lucene_searcher.doc(hit.doc)
        text = doc.get("text")
        ts = lucene_analyzer.tokenStream("text", StringReader(text))
        if showHighlight:
            print "<p>"
        print doc.get("title")
        if showHighlight:
            print "<br>"
            print highlighter.getBestFragments(ts, text, 3, "...")
            print "</p>"
def func_nr(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(command)
    scoreDocs = searcher.search(
        query, 50,
        Sort([SortField("rate", SortField.Type.DOUBLE, True)])).scoreDocs
    results = process(scoreDocs, searcher)
    return results
def search(music_tags, dir_path):
    lucene.initVM()
    query_str = "content:" + " ".join(music_tags)
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    lucene_analyzer = StandardAnalyzer()
    lucene_searcher = IndexSearcher(DirectoryReader.open(index_dir))
    my_query = QueryParser("content", lucene_analyzer).parse(query_str)
    total_hits = lucene_searcher.search(my_query, 50)
    for hit in total_hits.scoreDocs:
        doc = lucene_searcher.doc(hit.doc)
        print doc
def __init__(self, indexDir, root="testdocs"):
    # create and open an index writer
    indexDir = FSDirectory.open(Paths.get(indexDir))
    # TODO: make appropriate analyzer, add to config
    analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iw = IndexWriter(indexDir, config)
    self.authorcount = 0
    self.titlecount = 0
    self.errorcount = 0
    self.indexDocs(root, iw)
def similarityOfSynopsis(self):
    directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
    ireader = DirectoryReader.open(directory)
    searcher = IndexSearcher(ireader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
    for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
        filenames = [int(item) for item in filenames]
        filenames.sort()
        filenames = [str(item) for item in filenames]
        for filename in filenames:
            path = os.path.join(root, filename)
            major_movie = models.Movie.objects.get(pk=filename)
            with open(path, 'r') as moviedoc:
                content = moviedoc.read().replace('\n', ' ')
            content = re.sub('[^A-Za-z0-9 ]+', '', content)
            # retry with a doubled clause limit until the query parses
            while True:
                try:
                    query = queryParser.parse(QueryParser.escape(content))
                except Exception as e:
                    self.boolean_query.setMaxClauseCount(
                        self.boolean_query.maxClauseCount * 2)
                    print self.boolean_query.maxClauseCount
                    continue
                break
            topDocs = searcher.search(query, len(filenames))
            scoreDocs = topDocs.scoreDocs
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                movie_id = int(doc.get(FIELD_PATH))
                if movie_id <= major_movie.id:
                    continue
                minor_movie = models.Movie.objects.get(pk=movie_id)
                try:
                    similarity = models.Similarities.objects.filter(
                        first_movie=major_movie, second_movie=minor_movie).first()
                    if not similarity:
                        similarity = models.Similarities.objects.filter(
                            first_movie=minor_movie, second_movie=major_movie).first()
                    similarity.synopsis = scoreDoc.score
                    similarity.save()
                except Exception as e:
                    print major_movie.id, minor_movie.id
                    raise e
            print u"{0} completed.".format(major_movie.id)
def indexing(directory):
    # lucene.initVM(classpath=lucene.CLASSPATH)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        IndexFiles(directory, os.path.join(base_dir, INDEX_DIR), StandardAnalyzer())
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
        raise e
def populate_data(path, args):
    name = path.split('/')[-1]
    print(f"Processing {name}")
    all_senses = {}
    all_senses[args.lang1] = {}
    all_senses[args.lang2] = {}
    if args.pivot_lang is not None:
        all_senses[args.pivot_lang] = {}
    all_translation_mappings = []
    if args.pivot_lang is not None:
        all_translation_pivot1_mappings = []
        all_translation_pivot2_mappings = []

    store = SimpleFSDirectory(Paths.get(path))
    dr = DirectoryReader.open(store)
    searcher = IndexSearcher(dr)
    analyzer = StandardAnalyzer()
    query = QueryParser("title", analyzer).parse("*:*")
    topDocs = searcher.search(query, 1000000000)

    for scoreDoc in topDocs.scoreDocs:
        doc = scoreDoc.doc
        language_lemmas = searcher.doc(doc).getValues("LANGUAGE_LEMMA")
        sense_ids = searcher.doc(doc).getValues("ID_SENSE")
        for language_lemma, sense_id in zip(language_lemmas, sense_ids):
            lang = language_lemma[:2]
            lemma = language_lemma[3:]
            if language_lemma[:2] in LANGUAGES_OF_INTEREST:
                # add the sense rather than overwriting the per-language dict
                all_senses[lang][sense_id] = lemma
            if args.pivot_lang is not None and language_lemma[:2] == args.pivot_lang:
                all_senses[args.pivot_lang][sense_id] = lemma
        translation_mappings = searcher.doc(doc).getValues("TRANSLATION_MAPPING")
        create_translation_mapping(translation_mappings, all_senses,
                                   all_translation_mappings, LANGUAGES_OF_INTEREST)
        if args.pivot_lang is not None:
            create_translation_mapping(translation_mappings, all_senses,
                                       all_translation_pivot1_mappings,
                                       [args.lang1, args.pivot_lang])
            create_translation_mapping(translation_mappings, all_senses,
                                       all_translation_pivot2_mappings,
                                       [args.lang2, args.pivot_lang])

    output = open(f'{args.internal_data_path}/{name}.pkl', 'wb')
    pickle.dump(all_translation_mappings, output)
    output.close()
    if args.pivot_lang is not None:
        output = open(f'{args.internal_data_path}/{name}_{args.lang1}-{args.pivot_lang}.pkl', 'wb')
        pickle.dump(all_translation_pivot1_mappings, output)
        output.close()
        output = open(f'{args.internal_data_path}/{name}_{args.lang2}-{args.pivot_lang}.pkl', 'wb')
        pickle.dump(all_translation_pivot2_mappings, output)
        output.close()
def GET(self, name):
    STORE_DIR_GOOD = "index_good"
    STORE_DIR_BAD = "index_bad"
    vm_env.attachCurrentThread()
    directory_good = SimpleFSDirectory(File(STORE_DIR_GOOD))
    searcher_good = IndexSearcher(DirectoryReader.open(directory_good))
    directory_bad = SimpleFSDirectory(File(STORE_DIR_BAD))
    searcher_bad = IndexSearcher(DirectoryReader.open(directory_bad))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    user_data = web.input(name=None)
    command = yourInput(user_data.shop)
    res = Run_GoodRate(searcher_good, searcher_bad, analyzer, command, user_data.brand)
    res.append(command)
    return render.SearchResult(res)
def main():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    queries = makeQueryList(args["queryFile"])
    print 'lucene', lucene.VERSION
    print "\n"
    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    print directory.getDirectory()
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = StandardAnalyzer()
    run(searcher, analyzer, queries)
    del searcher
def doc_search(self, keywords):
    analyzer = StandardAnalyzer()
    parser = QueryParser('Title', analyzer)
    query = parser.parse(keywords)
    try:
        collector = TopScoreDocCollector.create(3000)
        self.lSearcher.search(query, collector)
        hits = collector.topDocs().scoreDocs
    except RuntimeError:
        print "Document scoring failed"
    self.hits = hits
    return hits
def search(self, query):
    lucene.initVM()
    luceneDirectory = "/index/"
    path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
    directory = FSDirectory.open(Paths.get(path))
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer()

    # args = len(sys.argv) - 1
    # if args < 1:
    #     print("\n No query was submitted! \n")
    # else:
    #     query_string = ""
    #     position = 1
    #     while args >= position:
    #         query_string = query_string + str(sys.argv[position]) + " "
    #         position = position + 1

    print("Searching for '" + query + "'")
    fields_to_search = ["text", "page title", "date"]
    filter_date = 'date:"May 25"'
    # a space before AND keeps the date filter and the query as separate terms
    filtered_query = filter_date + " AND " + query

    parser = MultiFieldQueryParser(fields_to_search, analyzer)
    updated_query = MultiFieldQueryParser.parse(parser, filtered_query)
    scored_documents = searcher.search(updated_query, 10).scoreDocs  # array of docs

    print("Found " + str(len(scored_documents)) + " matches in the collection.")

    results = []
    for doc in scored_documents:
        scoredTweet = dict()
        scoredTweet['score'] = doc.score
        result = searcher.doc(doc.doc)
        scoredTweet['username'] = result.get("username")
        scoredTweet['tweet_body'] = result.get("text")
        scoredTweet['date'] = result.get("date")
        results.append(scoredTweet)
        print(scoredTweet)

    return results
def index(infile, TitlemagNoneZijn, limit=10000):
    try:
        context = etree.iterparse(infile)
    except:
        print("cannot open file: {}".format(infile))
        return
    store = openStore()
    writer = None
    p = re.compile(r'<.*?>')
    try:
        analyzer = StandardAnalyzer()
        writer = getWriter(store, analyzer, True)
        counter = 0
        countwithtitle = 0
        for event, elem in context:
            if counter > limit:
                break
            ## debugging info to see how fast indexing is going
            # print(counter)
            # print(countwithtitle)
            counter += 1
            doc = Document()
            hasTitle = False
            for key in elem.attrib:
                # take a handful of tags
                if key in ["CreationDate", "Score", "Body", "CommentCount",
                           "LastActivityDate", "Id", "Tags", "Title",
                           "AnswerCount", "FavoriteCount"]:
                    if key == "Title":
                        # print(elem.attrib[key])
                        countwithtitle += 1
                        hasTitle = True
                    doc.add(Field(key, p.sub('', elem.attrib[key]),
                                  TextField.TYPE_STORED))
            if hasTitle or TitlemagNoneZijn:
                writer.addDocument(doc)
            elem.clear()
            for ancestor in elem.xpath('ancestor-or-self::*'):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[0]
    finally:
        del context
        writer.close()
def main():
    try:
        print "Indexing..."
        ######################################### Path ####################################
        indexDestination = File("/Users/Falcon/Desktop/New_Indices/Stack_A_Indices")
        # writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(),
        #                      True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {"typed_method_call": analyzer,
             "extends": analyzer,
             "used_classes": analyzer,
             "methods": analyzer,
             "class_instance_creation": analyzer,
             "methods_called": analyzer,
             "view_count": KeywordAnalyzer(),
             "code_hints": JavaCodeAnalyzer()}
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        # Alternative: KeywordAnalyzer for the code fields instead of the Porter chain.
        # a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
        #      "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
        #      "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
        #      "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
        # wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        # config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        # writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        counter = Counter()
        index_code_snippet(writer, counter)
        writer.commit()
        writer.close()
        print "Done"
        print str(counter)
    except CorruptIndexException as e:      # when the index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when another writer holds the index lock
        e.printStackTrace()
    except IOException as e:                # when the directory can't be read/written
        e.printStackTrace()
    except SQLException as e:               # when a database error occurs
        e.printStackTrace()
def setUp(self):
    super(BooleanOrTestCase, self).setUp()
    # add the doc to a ram index
    writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
    d = Document()
    d.add(Field(self.FIELD_T, "Optimize not deleting all files",
                TextField.TYPE_STORED))
    d.add(Field(self.FIELD_C,
                "Deleted When I run an optimize in our production environment.",
                TextField.TYPE_STORED))
    writer.addDocument(d)
    writer.close()
    self.searcher = self.getSearcher()
def rollback(collection_name):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # set writer configuration
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)
    writer.rollback()
    writer.close()
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # set writer configuration
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    # as of now, deletion of documents is only supported on indexed keys
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separate out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # filter documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(
            tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 0
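# A hypothetical call matching the signature above: delete rows whose primary
# key "id" equals "42" from the default collection. The return value is one of
# the status codes used in the function (100 for bad JSON, 105 for index
# errors, 0 on success).
status = delete(["id"], "DEFAULT", '{"id": "42"}', commit=True)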
class PorterStemmerAnalyzer(PythonAnalyzer):

    def createComponents(self, fieldName, reader):
        source = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        filter = StandardFilter(Version.LUCENE_CURRENT, source)
        filter = LowerCaseFilter(Version.LUCENE_CURRENT, filter)
        filter = PorterStemFilter(filter)
        filter = StopFilter(Version.LUCENE_CURRENT, filter,
                            StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        return self.TokenStreamComponents(source, filter)


lucene.initVM(vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
input = 'this is a test string for Analyzer'
ts = analyzer.tokenStream("dummy", StringReader(input))

# matchVersion = Version.LUCENE_XY  ## substitute desired Lucene version for XY
offsetAtt = ts.addAttribute(OffsetAttribute.class_)
termAtt = ts.addAttribute(CharTermAttribute.class_)
# posAtt = ts.addAttribute(PartOfSpeechAttribute.class_)


def testStandard():
    ts.reset()  # resets this stream to the beginning (required)
    while ts.incrementToken():
        # print ts.reflectAsString(True)
        print offsetAtt.startOffset()
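# A hedged sketch that runs the same input through the custom
# PorterStemmerAnalyzer defined above instead of StandardAnalyzer; assumes a
# PyLucene version matching the snippet (PythonAnalyzer with a no-arg
# constructor).
stemmed_analyzer = PorterStemmerAnalyzer()
ts2 = stemmed_analyzer.tokenStream("dummy", StringReader(input))
term = ts2.addAttribute(CharTermAttribute.class_)
ts2.reset()
while ts2.incrementToken():
    print term.toString()  # stemmed tokens, e.g. "test", "string", "analyz"
ts2.close()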
class HighlighterTestCase(PyLuceneTestCase):
    """
    Unit tests ported from Java Lucene.
    2004 by Yura Smolsky ;)
    """

    FIELD_NAME = "contents"
    texts = [
        "A wicked problem is one for which each attempt to create a solution changes the understanding of the problem. Wicked problems cannot be solved in a traditional linear fashion, because the problem definition evolves as new possible solutions are considered and/or implemented.",
        "Wicked problems always occur in a social context -- the wickedness of the problem reflects the diversity among the stakeholders in the problem.",
        "From http://cognexus.org/id42.htm",
        "Most projects in organizations -- and virtually all technology-related projects these days -- are about wicked problems. Indeed, it is the social complexity of these problems, not their technical complexity, that overwhelms most current problem solving and project management approaches.",
        "This text has a typo in referring to whicked problems",
    ]

    def __init__(self, *args):
        super(HighlighterTestCase, self).__init__(*args)
        self.parser = QueryParser(Version.LUCENE_CURRENT, self.FIELD_NAME,
                                  StandardAnalyzer(Version.LUCENE_CURRENT))

    def setUp(self):
        super(HighlighterTestCase, self).setUp()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        writer = self.getWriter(analyzer=self.analyzer)
        for text in self.texts:
            self.addDoc(writer, text)
        writer.commit()
        writer.close()
        self.reader = self.getReader()
        self.numHighlights = 0

    def testSimpleHighlighter(self):
        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))
            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired, "...")
            print "\t", result
        # Not sure we can assert anything here - just running to check we
        # don't throw any exceptions

    def testGetBestFragmentsSimpleQuery(self):
        self.doSearching("Wicked")
        self.doStandardHighlights()
        self.assert_(self.numHighlights == 3,
                     ("Failed to find correct number of highlights, %d found"
                      % (self.numHighlights)))

    def doSearching(self, queryString):
        self.searcher = self.getSearcher()
        self.query = self.parser.parse(queryString)
        # for any multi-term queries to work (prefix, wildcard, range,
        # fuzzy etc) you must use a rewritten query!
        self.query = self.query.rewrite(self.reader)
        print "Searching for:", self.query.toString(self.FIELD_NAME)
        self.scoreDocs = self.searcher.search(self.query, 100).scoreDocs
        self.numHighlights = 0

    def doStandardHighlights(self):
        formatter = TestFormatter(self)
        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))
            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result

    def countHighlightTerm(self):
        self.numHighlights += 1  # update stats used in assertions

    def addDoc(self, writer, text):
        d = Document()
        f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)
        d.add(f)
        writer.addDocument(d)