def main():
    """Index the benchmark into the directory named by ``dest_path``.

    ``dest_path`` is read from the enclosing scope.  The writer uses a
    ``PerFieldAnalyzerWrapper`` whose default analyzer is ``KeywordAnalyzer``;
    the ``code``, ``description`` and ``literals`` fields use a
    ``PorterAnalyzer`` wrapped around ``StandardAnalyzer`` instead.

    The actual indexing is delegated to ``generate_indices_from_benchmark``.
    Lucene/IO exceptions are caught and their stack traces printed; nothing
    is returned.
    """
    try:
        indicesDestination = File(dest_path)
        porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        field_analyzers = {
            "code": porter_analyzer,
            "description": porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": porter_analyzer,
            "word": KeywordAnalyzer(),
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(),
                                                   field_analyzers)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        counter = Counter()
        try:
            generate_indices_from_benchmark(writer, counter)
        finally:
            # Close even when indexing fails so the writer (and the index
            # lock it holds) is not left open.
            writer.close()
        print("All jobs are done..")
        print(str(counter))
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
def main():
    """Index code snippets into the hard-coded GitSearch index directory.

    The default analyzer is ``KeywordAnalyzer``; the ``code`` field uses
    ``JavaCodeAnalyzer`` and the ``comments`` field uses ``EnglishAnalyzer``.
    Indexing itself is delegated to ``index_code_snippet``.

    Lucene/IO exceptions are caught and their stack traces printed; nothing
    is returned.
    """
    try:
        print("Indexing...")
        indexDir = File("/home/ubuntu/Desktop/CoCaBu_remote/GitSearch/Indices")
        field_analyzers = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT),
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(),
                                                   field_analyzers)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)
        try:
            index_code_snippet(writer)
        finally:
            # Close even when indexing fails so the writer is not left open.
            writer.close()
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
def main():
    """Index the dyclink 2014 projects into ``/Indices/dyclink/2014``.

    Indexing itself is delegated to ``generate_indices_from_projects``; the
    ``Counter`` passed to it is printed once indexing has finished.

    Lucene/IO exceptions are caught and their stack traces printed; nothing
    is returned.
    """
    try:
        print("Indexing starts...")
        indicesDestination = File("/Indices/dyclink/2014")
        # KeywordAnalyzer treats the whole text as a single token, which is
        # effectively the same as not analyzing it at all.
        analyzer = KeywordAnalyzer()
        # Field -> analyzer map required by PerFieldAnalyzerWrapper (a plain
        # dict on the Python side).
        field_analyzers = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT),
        }
        # See http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, field_analyzers)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        counter = Counter()
        try:
            generate_indices_from_projects(writer, counter)
        finally:
            # Close even when indexing fails so the writer is not left open.
            writer.close()
        print("Done")
        print(str(counter))
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
def main():
    """Index Stack Overflow question snippets into ``Stack_Q_Indices``.

    The default analyzer is a ``PorterAnalyzer`` wrapped around
    ``StandardAnalyzer``; the structural fields listed below use
    ``KeywordAnalyzer`` and ``code_hints`` uses ``JavaCodeAnalyzer``.
    Indexing itself is delegated to ``index_code_snippet``, after which the
    writer is committed and closed.

    Lucene, IO and SQL exceptions are caught and their stack traces printed;
    nothing is returned.
    """
    try:
        print("Indexing...")
        indexDestination = File(
            "/Users/Falcon/Desktop/New_Indices/Stack_Q_Indices")
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        # KeywordAnalyzer: the whole original text of the field is handled
        # as a single token.
        field_analyzers = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "methods_called": KeywordAnalyzer(),
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer(),
        }
        # PerFieldAnalyzerWrapper: lets each field name its own analyzer.
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, field_analyzers)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        # Send Lucene's indexing diagnostics to stdout for debugging (the
        # Luke tool can also be used to inspect the index).
        config.setInfoStream(System.out)
        writer = IndexWriter(SimpleFSDirectory(indexDestination), config)
        counter = Counter()
        try:
            index_code_snippet(writer, counter)
            writer.commit()
        finally:
            # Close even when indexing fails so the writer is not left open.
            writer.close()
        print("Done")
        print(str(counter))
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  # when Database error occurs
        e.printStackTrace()
def main():
    """Index Stack Overflow snippets into the ``stackoverflow1107`` directory.

    The default analyzer is a ``PorterAnalyzer`` wrapped around
    ``StandardAnalyzer``; the structural fields listed below use
    ``KeywordAnalyzer`` and ``code_hints`` uses ``JavaCodeAnalyzer``.
    Indexing itself is delegated to ``index_code_snippet``, after which the
    writer is committed and closed.

    Lucene, IO and SQL exceptions are caught and their stack traces printed;
    nothing is returned.
    """
    try:
        print("Indexing...")
        indexDir = File("/Users/Raphael/Downloads/stackoverflow1107")
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        field_analyzers = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "methods_called": KeywordAnalyzer(),
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer(),
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, field_analyzers)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)
        try:
            index_code_snippet(writer)
            writer.commit()
        finally:
            # Close even when indexing fails so the writer is not left open.
            writer.close()
        print("Done")
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  # when Database error occurs
        e.printStackTrace()
def load_index(self):
    """Open the index at ``self.index_path`` and prepare search objects.

    Sets ``self.analyzer`` (``KeywordAnalyzer`` by default, with
    ``self.porter_analyzer`` for the ``code`` field), ``self.reader`` and
    ``self.searcher``, then prints the number of documents in the index.
    """
    index_location = File(self.index_path)
    self.analyzer = PerFieldAnalyzerWrapper(
        KeywordAnalyzer(), {"code": self.porter_analyzer})
    self.reader = IndexReader.open(SimpleFSDirectory(index_location))
    doc_total = self.reader.numDocs()
    self.searcher = IndexSearcher(self.reader)
    print("Index contains %d documents." % doc_total)
def __init__(self, source, index_path):
    """Parse ``source`` and load the index found at ``index_path``.

    ``parse`` is called with ``resolve=True, source=True`` and returns an
    ``(ast, source)`` pair; both are stored on the instance, the returned
    source replacing the one passed in.  A ``QueryParser`` defaulting to the
    ``typed_method_call`` field is created before ``load_index`` is invoked.
    """
    self.index_path = index_path
    self.source = source
    parsed_ast, parsed_source = parse(source, resolve=True, source=True)
    self.source = parsed_source
    self.ast = parsed_ast
    keyword_analyzer = KeywordAnalyzer()
    self.queryparser = QueryParser(
        Version.LUCENE_CURRENT, "typed_method_call", keyword_analyzer)
    self.load_index()
def __init__(self, host, user, passwd, name): self.conn = mysql.connect(host, user, passwd, name, charset='utf8') self.cursor = self.conn.cursor() STORE_DIR = "fp_index" lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION # base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) directory = SimpleFSDirectory(File(STORE_DIR)) self.searcher = IndexSearcher(DirectoryReader.open(directory)) self.analyzer = KeywordAnalyzer()
def load_index(self):
    """Build the per-field analyzer and open a searcher on the index.

    ``code``, ``description`` and ``literals`` use ``self.porter_analyzer``;
    the remaining listed fields, and any field not listed, use
    ``KeywordAnalyzer``.  Sets ``self.analyzer``, ``self.directory``,
    ``self.reader`` and ``self.searcher``.
    """
    per_field = {}
    for stemmed_field in ("code", "description", "literals"):
        per_field[stemmed_field] = self.porter_analyzer
    for exact_field in ("typed_method_call", "extends", "used_classes",
                        "methods", "class_instance_creation", "id"):
        per_field[exact_field] = KeywordAnalyzer()

    self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), per_field)
    self.directory = SimpleFSDirectory(self.index_path)
    self.reader = DirectoryReader.open(self.directory)
    self.searcher = IndexSearcher(self.reader)
def load_index(self):
    """Set ``self.analyzer`` to the per-field analyzer used for queries.

    ``code``, ``description`` and ``literals`` use ``self.porter_analyzer``;
    the remaining listed fields, and any field not listed, use
    ``KeywordAnalyzer``.
    """
    per_field = {
        field: KeywordAnalyzer()
        for field in ("typed_method_call", "extends", "used_classes",
                      "methods", "class_instance_creation", "id")
    }
    per_field.update(
        dict.fromkeys(("code", "description", "literals"),
                      self.porter_analyzer))
    self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), per_field)
def main():
    """Index Stack Overflow answer snippets into ``Stack_A_Indices``.

    A ``PorterAnalyzer`` wrapped around ``StandardAnalyzer`` is both the
    default analyzer and the analyzer of the structural fields listed
    below; ``view_count`` uses ``KeywordAnalyzer`` and ``code_hints`` uses
    ``JavaCodeAnalyzer``.  Indexing itself is delegated to
    ``index_code_snippet``, after which the writer is committed and closed.

    Lucene, IO and SQL exceptions are caught and their stack traces printed;
    nothing is returned.
    """
    try:
        print("Indexing...")
        # Destination path of the index.
        indexDestination = File(
            "/Users/Falcon/Desktop/New_Indices/Stack_A_Indices")
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        field_analyzers = {
            "typed_method_call": analyzer,
            "extends": analyzer,
            "used_classes": analyzer,
            "methods": analyzer,
            "class_instance_creation": analyzer,
            "methods_called": analyzer,
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer(),
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, field_analyzers)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDestination), config)
        counter = Counter()
        try:
            index_code_snippet(writer, counter)
            writer.commit()
        finally:
            # Close even when indexing fails so the writer is not left open.
            writer.close()
        print("Done")
        print(str(counter))
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  # when Database error occurs
        e.printStackTrace()
def search_upper_title_filter(id: str, seacher: IndexSearcher, titleFields: list, type: int):
    """Tell whether the parent entry of ``id`` has a title in ``titleFields``.

    The parent id is ``id`` cut at its last dot when ``type`` is 0, and at
    its first dot otherwise.  The parent is looked up by its ``id`` field
    (at most one hit) and its stored ``section`` (``type`` 0) or
    ``document`` (any other ``type``) value is tested against
    ``titleFields``.  With no hit, the empty string is tested instead.

    Returns:
        bool: ``True`` when the value is contained in ``titleFields``.
    """
    section_level = type == 0
    cut_at = id.rfind('.') if section_level else id.find('.')
    parent_query = QueryParser('id', KeywordAnalyzer()).parse(id[0:cut_at])
    top_hits = seacher.search(parent_query, 1)

    title = ''
    for match in top_hits.scoreDocs:
        parent_doc = seacher.doc(match.doc)
        title = parent_doc.get('section') if section_level else parent_doc.get('document')
    return title in titleFields
def searchAncient(self, field):
    """Search ``field`` for ``self._searchWord`` and collect matching entries.

    Builds a boolean query in which every clause is mandatory: one clause
    for the search word on ``field`` and one per entry of ``self._fields``
    (the ``section`` and ``document`` entries are skipped here and applied
    later through ``search_upper_title_filter``).

    Every hit's text is stored in ``self._doc`` under its id.  Hits accepted
    by ``doc_hit`` and ``key_filter`` are appended to ``self._match`` and
    ``self._resultSentencesList``.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; the nesting of the ``detail.pop`` calls and of the two
    ``print`` calls should be confirmed against the original file.

    Returns:
        The instance itself.
    """
    indexDir = SimpleFSDirectory(Paths.get(self._dir))
    sear = IndexSearcher(DirectoryReader.open(indexDir))
    bq = BooleanQuery.Builder()
    # Mandatory clause for the search word itself.
    q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._searchWord))
    bc = BooleanClause(q, BooleanClause.Occur.MUST)
    bq.add(bc)
    search_fields = self._fields
    for i in search_fields:
        # Title constraints are checked per hit further down, not in the query.
        if i == 'section' or i == 'document':
            continue
        queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(search_fields[i]))
        bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
        bq.add(bc)
    query = bq.build()
    hits = sear.search(query, 9999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        id = doc.get('id')
        detail = get_detail(doc)
        zhujie = detail['zhujie']
        # Flatten a nested {'detail': {...}} wrapper when present.
        if detail['detail'] and 'detail' in detail['detail'].keys():
            detail['detail'] = detail['detail']['detail']
        # These keys are dropped before the detail dict is serialised.
        detail.pop('zhujie')
        detail.pop('text')
        detail.pop('type')
        detail = json.dumps(detail)
        self._doc[id] = res
        if doc_hit(res, self._words):
            f = key_filter(self._words, self._re, res)
            if f:
                if 'section' in search_fields.keys():
                    if not search_upper_title_filter(id, sear, search_fields['section'], 0):
                        continue
                if 'document' in search_fields.keys():
                    if not search_upper_title_filter(id, sear, search_fields['document'], 1):
                        continue
                self._match.append(f)
                self._resultSentencesList.append((id, res, detail, zhujie))
                print(res)
                print(self._match)
    return self
def load_index(self):
    """Build the per-field analyzer and acquire a searcher from a manager.

    ``code``, ``description`` and ``literals`` use ``self.porter_analyzer``;
    the remaining listed fields, and any field not listed, use
    ``KeywordAnalyzer``.  Sets ``self.analyzer``, ``self.directory``,
    ``self.searchermgr`` and ``self.searcher``.
    """
    stemmed = self.porter_analyzer
    per_field = {"code": stemmed, "description": stemmed, "literals": stemmed}
    for exact_field in ("typed_method_call", "extends", "used_classes",
                        "methods", "class_instance_creation", "id"):
        per_field[exact_field] = KeywordAnalyzer()
    self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), per_field)

    self.directory = SimpleFSDirectory(self.index_path)
    manager = SearcherManager(self.directory, SearcherFactory())
    self.searchermgr = manager
    manager.maybeRefresh()
    self.searcher = manager.acquire()
def get_tf_idf(self, field_name: str, content_id: str):
    """
    Calculates the tf-idf for the words contained in the field of the
    content whose id is content_id

    The content is looked up through its ``content_id`` field and the term
    vector stored for ``field_name`` is read.  For every term,
    ``tf = 1 + log10(freq)`` and ``idf = log10(maxDoc / docFreq)``.

    The index reader opened here is always closed before returning, also
    when the lookup or the term-vector access raises.

    NOTE(review): when no document matches ``content_id`` the term vector is
    requested for offset -1, as before; whatever error that produces
    propagates to the caller.

    Args:
        field_name (str): Name of the field containing the words for which
            calculate the tf-idf
        content_id (str): Id of the content that contains the specified
            field

    Returns:
        words_bag (Dict <str, float>): Dictionary whose keys are the words
            contained in the field, and the corresponding values are the
            tf-idf values.
    """
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(self.directory)))
    try:
        searcher = IndexSearcher(reader)
        query = QueryParser("testo_libero", KeywordAnalyzer()).parse(
            "content_id:\"" + content_id + "\"")
        score_docs = searcher.search(query, 1).scoreDocs

        document_offset = -1
        for score_doc in score_docs:
            document_offset = score_doc.doc

        words_bag = {}
        term_vector = reader.getTermVector(document_offset, field_name)
        term_enum = term_vector.iterator()
        for term in BytesRefIterator.cast_(term_enum):
            term_text = term.utf8ToString()
            postings = term_enum.postings(None)
            postings.nextDoc()
            # normalized term frequency
            term_frequency = 1 + math.log10(postings.freq())
            inverse_document_frequency = math.log10(
                reader.maxDoc() / reader.docFreq(Term(field_name, term)))
            words_bag[term_text] = term_frequency * inverse_document_frequency
        return words_bag
    finally:
        reader.close()
def main(src, dst):
    """Index the projects found under ``src`` into the directory ``dst``.

    Indexing itself is delegated to ``generate_indices_from_projects``.  The
    ``Counter`` passed to it and the elapsed wall-clock time are printed
    once indexing has finished.

    Lucene/IO exceptions are caught and their stack traces printed; nothing
    is returned.

    Args:
        src: Passed through to ``generate_indices_from_projects``.
        dst: Path of the directory that receives the index files.
    """
    try:
        start_time = time.time()
        print("Indexing starts...")
        indicesDestination = File(dst)
        # Text such as a body or title must go through an analyzer, which
        # splits it into words, before it is indexed.  The analyzer is handed
        # to the IndexWriter together with the Directory; it splits the text
        # into index terms and removes unneeded words.
        #
        # KeywordAnalyzer treats the whole text as a single token, which is
        # effectively the same as not analyzing it at all.
        analyzer = KeywordAnalyzer()
        # Field -> analyzer map required by PerFieldAnalyzerWrapper (a plain
        # dict on the Python side).
        field_analyzers = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT),
        }
        # See http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, field_analyzers)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        # SimpleFSDirectory stores the index files in a directory on the
        # file system.
        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        counter = Counter()
        try:
            generate_indices_from_projects(src, writer, counter)
        finally:
            # Close even when indexing fails so the writer is not left open.
            writer.close()
        print("Done")
        print(str(counter))
        print("$$$%s\tseconds" % (time.time() - start_time))
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
def main():
    """Index the IJA projects into the hard-coded ``IJA_Indices`` directory.

    The default analyzer is ``KeywordAnalyzer``; the ``code`` field uses
    ``JavaCodeAnalyzer`` and the ``comments`` field uses ``EnglishAnalyzer``.
    Indexing itself is delegated to ``generate_indices_from_projects``; the
    ``Counter`` passed to it is printed once indexing has finished.

    Lucene/IO exceptions are caught and their stack traces printed; nothing
    is returned.
    """
    try:
        print("Indexing starts...")
        indicesDestination = File(
            "/Users/Falcon/Desktop/New_Indices/IJA_Indices")
        field_analyzers = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT),
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(),
                                                   field_analyzers)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        counter = Counter()
        try:
            generate_indices_from_projects(writer, counter)
        finally:
            # Close even when indexing fails so the writer is not left open.
            writer.close()
        print("Done")
        print(str(counter))
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
def load_index(self):
    """Open the index at ``self.index_path`` and build the query analyzer.

    The default analyzer is a ``PorterAnalyzer`` wrapped around
    ``StandardAnalyzer``; the structural fields and ``id`` use
    ``KeywordAnalyzer`` and ``code`` uses ``JavaCodeAnalyzer``.  Sets
    ``self.analyzer`` and ``self.reader`` and prints the document count.
    """
    index_location = File(self.index_path)
    default_analyzer = PorterAnalyzer(
        StandardAnalyzer(Version.LUCENE_CURRENT))
    per_field = {
        field: KeywordAnalyzer()
        for field in ("typed_method_call", "extends", "used_classes",
                      "methods", "class_instance_creation", "id")
    }
    per_field["code"] = JavaCodeAnalyzer()
    self.analyzer = PerFieldAnalyzerWrapper(default_analyzer, per_field)

    # Original author's note: the IndexReader used to be opened here and
    # never closed.
    self.reader = IndexReader.open(SimpleFSDirectory(index_location))
    doc_total = self.reader.numDocs()
    print("Index contains %d documents." % doc_total)
def load_index(self):
    """Open the GitHub index at ``self.index_path`` and prepare searching.

    ``self.porter_analyzer`` is the default analyzer and is also used for
    ``code`` and ``description``; the structural fields and ``id`` use
    ``KeywordAnalyzer``.  Sets ``self.analyzer``, ``self.reader`` and
    ``self.searcher`` and prints the document count.
    """
    index_location = File(self.index_path)
    stemmed = self.porter_analyzer
    per_field = {"code": stemmed, "description": stemmed}
    for exact_field in ("typed_method_call", "extends", "used_classes",
                        "methods", "class_instance_creation", "id"):
        per_field[exact_field] = KeywordAnalyzer()
    self.analyzer = PerFieldAnalyzerWrapper(self.porter_analyzer, per_field)

    self.reader = IndexReader.open(SimpleFSDirectory(index_location))
    doc_total = self.reader.numDocs()
    self.searcher = IndexSearcher(self.reader)
    print("\nLoading Indices... GitHub index contains [%d] documents." % doc_total)
def __recs_query(self, positive_rated_document_list, scores, recs_number,
                 items_directory, candidate_list: List) -> pd.DataFrame:
    """
    Builds a query using the contents that the user liked. The terms
    relative to the contents that the user liked are boosted by the rating
    he/she gave. A filter clause is added to the query to consider only
    candidate items

    The text of each field (``content_id`` excluded) is gathered from every
    liked document and joined with spaces, so that the last word of one
    document and the first word of the next stay separate tokens.  Each
    per-field query is added once per score, boosted by that score.

    ``positive_rated_document_list`` is not modified, and none of the liked
    documents is returned as a recommendation.  The index reader opened
    here is closed before returning.

    Args:
        positive_rated_document_list: List of contents that the user liked
        scores: Ratings given by the user
        recs_number: How many items must be recommended. You can only
            specify the number, not a specific item for which compute the
            prediction
        items_directory: Directory where the items are stored
        candidate_list: Ids of the items that may be recommended, or
            ``None`` to consider every item

    Returns:
        score_frame (pd.DataFrame): DataFrame containing the recommendations
            for the user (columns ``to_id`` and ``rating``); empty when the
            user liked no document.
    """
    columns = ['to_id', 'rating']
    BooleanQuery.setMaxClauseCount(2000000)
    if not positive_rated_document_list:
        # Without liked documents there is no profile to build a query from.
        return pd.DataFrame(columns=columns)

    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(items_directory)))
    try:
        searcher = IndexSearcher(reader)
        if self.__classic_similarity:
            searcher.setSimilarity(ClassicSimilarity())

        # Collect, per field, the text of every liked document.
        field_texts = {}
        for liked_doc in positive_rated_document_list:
            for field in searcher.doc(liked_doc).getFields():
                field_name = field.name()
                if field_name == 'content_id':
                    continue
                field_texts.setdefault(field_name, []).append(
                    field.stringValue())

        logger.info("Building query")
        analyzer = SimpleAnalyzer()
        # The parsed query of a field does not depend on the score, so it is
        # built once and only the boost wrapper is created per score.
        field_queries = []
        for field_name, texts in field_texts.items():
            parser = QueryParser(field_name, analyzer)
            parser.setDefaultOperator(QueryParser.Operator.OR)
            field_queries.append(
                parser.parse(QueryParser.escape(" ".join(texts))))

        query_builder = BooleanQuery.Builder()
        for score in scores:
            for field_query in field_queries:
                query_builder.add(BoostQuery(field_query, score),
                                  BooleanClause.Occur.SHOULD)

        if candidate_list is not None:
            id_query_string = ' OR '.join(
                "content_id:\"" + content_id + "\""
                for content_id in candidate_list)
            id_query = QueryParser("testo_libero",
                                   KeywordAnalyzer()).parse(id_query_string)
            query_builder.add(id_query, BooleanClause.Occur.MUST)

        query = query_builder.build()
        # Ask for enough hits to still have recs_number left after every
        # liked document has been filtered out.
        docs_to_search = len(positive_rated_document_list) + recs_number
        scoreDocs = searcher.search(query, docs_to_search).scoreDocs

        logger.info("Building score frame to return")
        liked_docs = set(positive_rated_document_list)
        recorded_items = 0
        score_frame = pd.DataFrame(columns=columns)
        for scoreDoc in scoreDocs:
            if recorded_items >= recs_number:
                break
            if scoreDoc.doc not in liked_docs:
                doc = searcher.doc(scoreDoc.doc)
                item_id = doc.getField("content_id").stringValue()
                recorded_items += 1
                score_frame = pd.concat([
                    score_frame,
                    pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                              columns=columns)
                ])
        return score_frame
    finally:
        reader.close()
default='8g', help="min ram for the VM")
op.add_option("--max_n", dest='max_n', default=MAX_N, help="max return search item")
opts, args = op.parse_args(sys.argv)
# Start the Lucene JVM with the heap size given on the command line.
lucene.initVM(maxheap=opts.maxheap)
print('lucene', lucene.VERSION)
start = datetime.now()
# Choose the analyzer: KeywordAnalyzer for exact matching, StandardAnalyzer
# otherwise.
if opts.exact_match:
    print("creating keyworkanalyzer -> exact match on %s" % DEFAULT_SEARCH_FIELD)
    analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
else:
    print("creating stdanalyzer -> keyword match on %s" % DEFAULT_SEARCH_FIELD)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# The index directory is resolved relative to the directory of this script.
base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
fname = os.path.join(base_dir, opts.index_dir)
if opts.create_index:
    # NOTE(review): sys.argv[1] is read below, so this check expects the
    # directory to index as the first raw command-line argument.
    if len(sys.argv) < 2:
        sys.stderr.writelines("ERROR: need a directory to index\n")
        sys.exit(1)
    try:
        print("creating index:", fname)
        IndexFiles(sys.argv[1], fname, analyzer, not opts.all_line)
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.queries.function.valuesource import LongFieldSource
from org.apache.lucene.queries.function import FunctionQuery
from org.apache.lucene.queries import CustomScoreQuery
from com.mongodb import BasicDBObject
from collections import Counter
import utils

indexDir = File("/tmp/github")

# 1. open the index
analyzer = KeywordAnalyzer()
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. set up the query (the query string is hard-coded below: a
#    space-separated list of typed method calls)
# a = {"typed_method_call": WhitespaceAnalyzer()}
# wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
query_string = "HttpURLConnection.disconnect Exception.printStackTrace BufferedReader.close HttpURLConnection.setRequestProperty HttpURLConnection.setRequestMethod DataOutputStream.writeBytes HttpURLConnection.getInputStream DataOutputStream.close HttpURLConnection.setUseCaches StringBuffer.append URL.openConnection HttpURLConnection.getOutputStream Integer.toString String.getBytes StringBuffer.toString HttpURLConnection.setDoOutput BufferedReader.readLine DataOutputStream.flush HttpURLConnection.setDoInput"
# Parser restricted to the "typed_method_call" field.
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["typed_method_call"], analyzer)
#base_query = getSpanNearQuery(analyzer, query_string)
# boilerplate for setting up spellchecking: from java.io import StringReader from org.apache.lucene.store import RAMDirectory from org.apache.lucene.index import IndexWriterConfig from org.apache.lucene.analysis.core import KeywordAnalyzer # Start JVM for Lucene. lucene.initVM() # Set up Lucene spellchecking. dict_reader = StringReader(dict_str) dictionary = PlainTextDictionary(dict_reader) ramdir = RAMDirectory() spellchecker = SpellChecker(ramdir) spellchecker.indexDictionary(dictionary, IndexWriterConfig(KeywordAnalyzer()), True) # Run the word correction test. def correct_word(word): candidates = spellchecker.suggestSimilar(word, 10) if len(candidates) > 0: return candidates[0] else: return '' good, bad = [], [] with open('Edit_distance_corrections_{}.tab'.format(EXPERIM_ID), 'w+') as corrs_file:
def main(index_dir, input_dir):
    """Link news items that share a title token and write the links to CSV.

    Every ``*.json`` file in ``input_dir`` is read; the part of its name
    before the first dot is taken as the journal code.  For each entry and
    each lower-cased token of its title, the Lucene index in ``index_dir``
    is searched for documents with that token in the title and the same
    date, excluding the entry's own journal and URL.  One row
    ``(url, token, matching url)`` per hit is written to
    ``relationships.csv`` in the current working directory.

    Queries that cannot be parsed are skipped.  The index reader is closed
    once searching is over, also when an error interrupts it.

    Args:
        index_dir: Path of the Lucene index to search.
        input_dir: Directory containing the JSON files to process.
    """
    lucene.initVM()

    # Open index
    logger.info("Opening Lucene index [%s]..." % index_dir)
    fs_dir = SimpleFSDirectory(Paths.get(index_dir))
    query_parser = QueryParser("title", KeywordAnalyzer())
    reader = DirectoryReader.open(fs_dir)

    rels = []
    try:
        searcher = IndexSearcher(reader)

        # Search documents
        json_files = [
            f for f in listdir(input_dir)
            if isfile(join(input_dir, f)) and f.endswith('.json')
        ]
        for file_name in json_files:
            journal_code = file_name.split('.')[0]
            with open(join(input_dir, file_name)) as json_data:
                data = json.load(json_data)

            for entry in data:
                url = entry['url']
                date = entry['date']
                title = entry['title']
                logger.debug("Processing URL [%s] date [%s] - [%s]" %
                             (url, date, title))

                tokens = [t.lower() for t in nltk.word_tokenize(title)]
                for token in tokens:
                    q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (
                        token, date, journal_code, url)
                    try:
                        query = query_parser.parse(q)
                    except Exception:
                        # Tokens that break the query syntax are skipped on
                        # purpose; anything that is not an Exception (for
                        # example KeyboardInterrupt) still propagates.
                        continue
                    hits = searcher.search(query, MAX_HITS)
                    logger.debug(
                        "Found %d document(s) that matched query '%s':" %
                        (hits.totalHits, query))
                    for hit in hits.scoreDocs:
                        doc = searcher.doc(hit.doc)
                        logger.debug(doc)
                        rels.append({
                            'left': url,
                            'token': token,
                            'right': doc.get('url')
                        })
    finally:
        reader.close()

    with open('relationships.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile,
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_ALL)
        for rel in rels:
            csvwriter.writerow([rel['left'], rel['token'], rel['right']])
# NOTE(review): this fragment starts mid-script; the statement below may
# belong to a loop whose header is not visible here.
apis.append(answer["typed_method_call"])
print apis

indexDir = File("/tmp/stackoverflow")

# 1. open the index
analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. parse the query (the query string is hard-coded below)
# "typed_method_call" and "extends" use KeywordAnalyzer; every other field
# falls back to the Porter analyzer created above.
a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer()}
wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
query_string = "lucene get similar documents to the current one"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["title"], wrapper_analyzer)
#base_query = getSpanNearQuery(analyzer, query_string)
base_query = query_parser.parse(query_string)

# Combine the text query with a function query over the "view_count" field.
#http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
boost_query = FunctionQuery(LongFieldSource("view_count"))
query = CustomScoreQuery(base_query, boost_query)
# queryparser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
def ancientSearch(self, field):
    """Run the search described by ``self._commandInfo`` against ``field``.

    The query is chosen from the command's word list and key:

    * no words: field-only search, one mandatory clause per field using the
      first value given for that field;
    * no key, or a key starting with ``-`` or ``~``: the first word plus
      the field constraints;
    * key starting with ``#``: the first two words plus the field
      constraints;
    * key starting with ``$`` or ``+``: every word plus the field
      constraints;
    * anything else: ``query`` is set to the empty string.

    In the last three query shapes the ``section`` and ``document`` fields
    are left out of the query and checked per hit through
    ``search_upper_title_filter``.

    Results are stored in ``self._doc`` and ``self._resultSentencesList``.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; the nesting of the ``detail.pop`` calls and of the statements
    inside the field-only branch should be confirmed against the original
    file.

    Returns:
        The instance itself.
    """
    sear = self._search
    fieldOnly = False
    # Field-only search: no search words were given.
    if len(self._commandInfo.getWordList()) == 0:
        fieldOnly = True
        bq = BooleanQuery.Builder()
        fields = self._commandInfo.getFields()
        for key in fields:
            queryx = QueryParser(key, KeywordAnalyzer()).parse(fields[key][0])
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
        bq = BooleanQuery.Builder()
        q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        bc = BooleanClause(q, BooleanClause.Occur.MUST)
        bq.add(bc)
        for i in self._commandInfo.getFields():
            # Title constraints are checked per hit further down.
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif self._commandInfo.getKey()[0] == '#':
        bq = BooleanQuery.Builder()
        query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
        bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
        bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
        bq.add(bc1).add(bc2)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif self._commandInfo.getKey()[0] in ['$', '+']:
        bq = BooleanQuery.Builder()
        for w in self._commandInfo.getWordList():
            queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    else:
        # NOTE(review): an empty string, not a query object, is passed to
        # sear.search() below in this case.
        query = ''
    hits = sear.search(query, 9999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        id = doc.get('id')
        detail = get_detail(doc)
        zhujie = detail['zhujie']
        # Flatten a nested {'detail': {...}} wrapper when present.
        if detail['detail'] and 'detail' in detail['detail'].keys():
            detail['detail'] = detail['detail']['detail']
        # These keys are dropped before the detail dict is serialised.
        detail.pop('zhujie')
        detail.pop('text')
        detail.pop('type')
        detail = json.dumps(detail)
        if fieldOnly:
            if not doc.get("text").strip():
                continue
            # The number of dots in the id selects which entry is recorded:
            # with two dots the hit itself, otherwise the entry whose id is
            # this id followed by ".1" or ".1.1".
            if id.count(".") == 2:
                self._doc[id] = doc.get("text")
                self._resultSentencesList.append((id, doc.get("text")))
            elif id.count(".") == 1:
                # NOTE(review): query, hits, hit, doc and res of the
                # enclosing loop are rebound from here on.
                searcher = self._search
                query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1')
                hits = searcher.search(query, 1)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    res = doc.get("text")
                    if res:
                        self._doc[id+".1"] = doc.get('text')
                        self._resultSentencesList.append((id + ".1", doc.get('text')))
            else:
                searcher = self._search
                query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1.1')
                hits = searcher.search(query, 1)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    res = doc.get("text")
                    if not doc.get("text").strip():
                        continue
                    if res:
                        self._doc[id+".1.1"] = doc.get('text')
                        self._resultSentencesList.append((id + ".1.1", doc.get('text')))
        elif doc_hit(res, self._commandInfo):
            if key_filter(self._commandInfo, res):
                if 'section' in self._commandInfo.getFields().keys():
                    if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['section'], 0):
                        continue
                if 'document' in self._commandInfo.getFields().keys():
                    if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['document'], 1):
                        continue
                self._doc[id] = res
                self._resultSentencesList.append((id, res, detail, zhujie))
    return self