def main():
    try:
        indicesDestination = File(dest_path)
        analyzer = KeywordAnalyzer()
        porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "code": porter_analyzer,
            "description": porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": porter_analyzer,
            "word": KeywordAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        counter = Counter()
        generate_indices_from_benchmark(writer, counter)
        writer.close()

        print "All jobs are done.."
        print str(counter)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
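
# A minimal, assumed entry point for the snippet above (not part of the original
# module; dest_path is expected to be defined elsewhere): PyLucene requires the
# JVM to be started once per process before any Lucene class is used.
if __name__ == "__main__":
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    main()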
Example #2
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/home/ubuntu/Desktop/CoCaBu_remote/GitSearch/Indices")

        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = KeywordAnalyzer()  # PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.close()
    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Example #3
def main():
    try:
        print "Indexing starts..."
        # indicesDestination = File("/Users/Falcon/Desktop/dyclink_2014")############################################

        indicesDestination = File("/Indices/dyclink/2014")

        analyzer = KeywordAnalyzer()  # Treats the whole text as a single token (i.e., effectively the same as not analyzing at all).
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }  # Map of per-field analyzers for PerFieldAnalyzerWrapper (a dict in Python)
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        # http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)

        counter = Counter()
        generate_indices_from_projects(writer, counter)
        writer.close()

        print "Done"
        print str(counter)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Example #4
def main():
	try:
		print "Indexing..."
		indexDestination = File("/Users/Falcon/Desktop/New_Indices/Stack_Q_Indices")
		# writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
		analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
			 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
			 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
		# KeywordAnalyzer: treats the entire field text as a single token
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		# PerFieldAnalyzerWrapper: lets a different analyzer be assigned to each field
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
		config.setInfoStream(System.out)  # Debug output for Lucene indexing; the Luke tool can also be used to inspect and manage the index.
		writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

		counter = Counter()
		index_code_snippet(writer, counter)
		writer.commit()

		writer.close()
		print "Done"
		print str(counter)

	except CorruptIndexException as e:  # when index is corrupt
		e.printStackTrace()
	except LockObtainFailedException as e:  # when other writer is using the index
		e.printStackTrace()
	except IOException as e:  # when directory can't be read/written
		e.printStackTrace()
	except SQLException as e:  # when Database error occurs
		e.printStackTrace()
Example #5
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/Users/Raphael/Downloads/stackoverflow1107")

        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "methods_called": KeywordAnalyzer(),
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.commit()
        writer.close()
        print "Done"
    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Example #6
 def load_index(self):
     indexDir = File(self.index_path)
     a = {"code": self.porter_analyzer}
     self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
     index = SimpleFSDirectory(indexDir)
     self.reader = IndexReader.open(index)
     n_docs = self.reader.numDocs()
     self.searcher = IndexSearcher(self.reader)
     print("Index contains %d documents." % n_docs)
Example #7
 def __init__(self, source, index_path):
     self.index_path = index_path
     self.source = source
     ast, source = parse(self.source, resolve=True, source=True)
     self.source = source
     self.ast = ast
     self.queryparser = QueryParser(Version.LUCENE_CURRENT,
                                    "typed_method_call", KeywordAnalyzer())
     self.load_index()
 def __init__(self, host, user, passwd, name):
     self.conn = mysql.connect(host, user, passwd, name, charset='utf8')
     self.cursor = self.conn.cursor()
     STORE_DIR = "fp_index"
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     print 'lucene', lucene.VERSION
     # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
     directory = SimpleFSDirectory(File(STORE_DIR))
     self.searcher = IndexSearcher(DirectoryReader.open(directory))
     self.analyzer = KeywordAnalyzer()
Example #9
	def load_index(self):
		a = {"code": self.porter_analyzer, "description": self.porter_analyzer, "typed_method_call": KeywordAnalyzer(),
			 "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "id": KeywordAnalyzer(), "literals": self.porter_analyzer}
		self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
		self.directory = SimpleFSDirectory(self.index_path)
		self.reader = DirectoryReader.open(self.directory)
		self.searcher = IndexSearcher(self.reader)
Example #10
 def load_index(self):
     a = {
         "code": self.porter_analyzer,
         "description": self.porter_analyzer,
         "typed_method_call": KeywordAnalyzer(),
         "extends": KeywordAnalyzer(),
         "used_classes": KeywordAnalyzer(),
         "methods": KeywordAnalyzer(),
         "class_instance_creation": KeywordAnalyzer(),
         "id": KeywordAnalyzer(),
         "literals": self.porter_analyzer
     }
     self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
Example #11
def main():
    try:
        print "Indexing..."
        #########################################  Path  ####################################
        indexDestination = File(
            "/Users/Falcon/Desktop/New_Indices/Stack_A_Indices")

        #writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": analyzer,
            "extends": analyzer,
            "used_classes": analyzer,
            "methods": analyzer,
            "class_instance_creation": analyzer,
            "methods_called": analyzer,
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        # analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        # a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
        # 	 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
        # 	 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
        # 	 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
        # wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        # config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        # writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        counter = Counter()
        index_code_snippet(writer, counter)
        writer.commit()
        writer.close()

        print "Done"
        print str(counter)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Example #12
def search_upper_title_filter(id: str, searcher: IndexSearcher, titleFields: list, type: int):
    sear = searcher
    if type == 0:
        upper_id = id[0:id.rfind('.')]
    else:
        upper_id = id[0:id.find('.')]
    query = QueryParser('id', KeywordAnalyzer()).parse(upper_id)
    hits = sear.search(query, 1)
    res = ''
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        if type == 0:
            res = doc.get('section')
        else:
            res = doc.get('document')
    if res in titleFields:
        return True
    return False
Example #13
 def searchAncient(self, field):
     indexDir = SimpleFSDirectory(Paths.get(self._dir))
     sear = IndexSearcher(DirectoryReader.open(indexDir))
     bq = BooleanQuery.Builder()
     q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._searchWord))
     bc = BooleanClause(q, BooleanClause.Occur.MUST)
     bq.add(bc)
     search_fields = self._fields
     for i in search_fields:
         if i == 'section' or i == 'document':
             continue
         queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(search_fields[i]))
         bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
         bq.add(bc)
     query = bq.build()
     hits = sear.search(query, 9999)
     for hit in hits.scoreDocs:
         doc = sear.doc(hit.doc)
         res = doc.get(field)
         id = doc.get('id')
         detail = get_detail(doc)
         zhujie = detail['zhujie']
         if detail['detail'] and 'detail' in detail['detail'].keys():
             detail['detail'] = detail['detail']['detail']
         detail.pop('zhujie')
         detail.pop('text')
         detail.pop('type')
         detail = json.dumps(detail)
         self._doc[id] = res
         if doc_hit(res, self._words):
             f = key_filter(self._words, self._re, res)
             if f:
                 if 'section' in search_fields.keys():
                     if not search_upper_title_filter(id, sear, search_fields['section'], 0):
                         continue
                 if 'document' in search_fields.keys():
                     if not search_upper_title_filter(id, sear, search_fields['document'], 1):
                         continue
                 self._match.append(f)
                 self._resultSentencesList.append((id, res, detail, zhujie))
                 print(res)
                 print(self._match)
     return self
Example #14
    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
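
    # A hedged companion sketch (method name assumed, not in the original class):
    # searchers handed out by a SearcherManager should be released once a request
    # is done, and the manager and directory closed on shutdown.
    def close_index(self):
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.searchermgr.close()
        self.directory.close()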
Example #15
    def get_tf_idf(self, field_name: str, content_id: str):
        """
        Calculates the tf-idf for the words contained in the field of the content whose id
        is content_id

        Args:
            field_name (str): Name of the field containing the words for which to calculate the tf-idf
            content_id (str): Id of the content that contains the specified field

        Returns:
            words_bag (Dict<str, float>): Dictionary whose keys are the words contained
            in the field and whose values are the corresponding tf-idf scores.
        """
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(Paths.get(self.directory))))
        query = QueryParser("testo_libero",
                            KeywordAnalyzer()).parse("content_id:\"" +
                                                     content_id + "\"")
        score_docs = searcher.search(query, 1).scoreDocs
        document_offset = -1
        for score_doc in score_docs:
            document_offset = score_doc.doc

        reader = searcher.getIndexReader()
        words_bag = {}
        term_vector = reader.getTermVector(document_offset, field_name)
        term_enum = term_vector.iterator()
        for term in BytesRefIterator.cast_(term_enum):
            term_text = term.utf8ToString()
            postings = term_enum.postings(None)
            postings.nextDoc()
            term_frequency = 1 + math.log10(
                postings.freq())  # normalized term frequency
            inverse_document_frequency = math.log10(
                reader.maxDoc() / reader.docFreq(Term(field_name, term)))
            tf_idf = term_frequency * inverse_document_frequency
            words_bag[term_text] = tf_idf

        reader.close()
        return words_bag
Example #16
def main(src, dst):
    try:
        start_time = time.time()

        print "Indexing starts..."
        indicesDestination = File(dst)
        #writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        # Analyzer: text such as a title or body must be run through an analyzer and split into terms before it can be indexed.
        # The Analyzer is passed, together with the Directory, to the IndexWriter constructor; it splits the given text into
        # the unit terms to be indexed and removes unneeded words.

        analyzer = KeywordAnalyzer()  # Treats the whole text as a single token (i.e., effectively the same as not analyzing at all).
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }  # Map of per-field analyzers for PerFieldAnalyzerWrapper (a dict in Python)
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        # http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        # SimpleFSDirectory stores the index files in a specific directory on the file system (the alternatives are DB- and RAM-backed directories).
        # config carries the per-field analyzer setup that the IndexWriter uses to tokenize the documents.

        counter = Counter()
        generate_indices_from_projects(src, writer, counter)
        writer.close()
        print "Done"
        print str(counter)
        print "$$$%s\tseconds" % (time.time() - start_time)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Example #17
def main():
	try:
		print "Indexing starts..."
		indicesDestination = File("/Users/Falcon/Desktop/New_Indices/IJA_Indices")

		analyzer = KeywordAnalyzer()  
		a = {"code": JavaCodeAnalyzer(), "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)}
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a) 				
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

		writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
		counter = Counter()
		generate_indices_from_projects(writer, counter)
		writer.close()

		print "Done"
		print str(counter)

	except CorruptIndexException as e:		#when index is corrupt
			e.printStackTrace()
	except LockObtainFailedException as e:	#when other writer is using the index
			e.printStackTrace()
	except IOException as e:	#when directory can't be read/written
			e.printStackTrace()
Example #18
    def load_index(self):
        indexDir = File(self.index_path)
        porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "code": JavaCodeAnalyzer()
        }

        self.analyzer = PerFieldAnalyzerWrapper(porter_analyzer, a)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)  # Previously the IndexReader was opened here but never closed...
        n_docs = self.reader.numDocs()
        print("Index contains %d documents." % n_docs)
Example #19
    def load_index(self):
        indexDir = File(self.index_path)
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer()
        }
        self.analyzer = PerFieldAnalyzerWrapper(self.porter_analyzer, a)

        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("\nLoading Indices... GitHub index contains [%d] documents." %
              n_docs)
Example #20
    def __recs_query(self, positive_rated_document_list, scores, recs_number,
                     items_directory, candidate_list: List) -> pd.DataFrame:
        """
        Builds a query from the contents that the user liked. The terms belonging to those
        contents are boosted by the rating the user gave them. A filter clause is added to
        the query so that only candidate items are considered.
        Args:
            positive_rated_document_list: List of contents that the user liked
            scores: Ratings given by the user
            recs_number: How many items must be recommended. You can only specify the number,
                not a specific item for which to compute the prediction
            items_directory: Directory where the items are stored

        Returns:
            score_frame (pd.DataFrame): DataFrame containing the recommendations for the user
        """
        BooleanQuery.setMaxClauseCount(2000000)
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(
                Paths.get(items_directory))))
        if self.__classic_similarity:
            searcher.setSimilarity(ClassicSimilarity())

        field_list = searcher.doc(positive_rated_document_list[0]).getFields()
        user_fields = {}
        field_parsers = {}
        analyzer = SimpleAnalyzer()
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] = field.stringValue()
            field_parsers[field.name()] = QueryParser(field.name(), analyzer)

        positive_rated_document_list.remove(positive_rated_document_list[0])

        for document in positive_rated_document_list:
            for field in searcher.doc(document).getFields():
                if field.name() == 'content_id':
                    continue
                user_fields[field.name()] += field.stringValue()

        logger.info("Building query")

        query_builder = BooleanQuery.Builder()
        for score in scores:
            for field_name in user_fields.keys():
                if field_name == 'content_id':
                    continue
                field_parsers[field_name].setDefaultOperator(
                    QueryParser.Operator.OR)

                field_query = field_parsers[field_name].escape(
                    user_fields[field_name])
                field_query = field_parsers[field_name].parse(field_query)
                field_query = BoostQuery(field_query, score)
                query_builder.add(field_query, BooleanClause.Occur.SHOULD)

        if candidate_list is not None:
            id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                          for content_id in candidate_list)
            id_query = QueryParser("testo_libero",
                                   KeywordAnalyzer()).parse(id_query_string)
            query_builder.add(id_query, BooleanClause.Occur.MUST)

        query = query_builder.build()
        docs_to_search = len(positive_rated_document_list) + recs_number
        scoreDocs = searcher.search(query, docs_to_search).scoreDocs

        logger.info("Building score frame to return")

        recorded_items = 0
        columns = ['to_id', 'rating']
        score_frame = pd.DataFrame(columns=columns)
        for scoreDoc in scoreDocs:
            if recorded_items >= recs_number:
                break
            if scoreDoc.doc not in positive_rated_document_list:
                doc = searcher.doc(scoreDoc.doc)
                item_id = doc.getField("content_id").stringValue()
                recorded_items += 1

                score_frame = pd.concat([
                    score_frame,
                    pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                              columns=columns)
                ])

        return score_frame
Example #21
File: index.py  Project: fraka6/mlboost
                  default='8g',
                  help="min ram for the VM")
    op.add_option("--max_n",
                  dest='max_n',
                  default=MAX_N,
                  help="max return search item")
    opts, args = op.parse_args(sys.argv)

    lucene.initVM(maxheap=opts.maxheap)
    print('lucene', lucene.VERSION)
    start = datetime.now()

    if opts.exact_match:
        print("creating keyworkanalyzer -> exact match on %s" %
              DEFAULT_SEARCH_FIELD)
        analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
    else:
        print("creating stdanalyzer -> keyword match on %s" %
              DEFAULT_SEARCH_FIELD)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    fname = os.path.join(base_dir, opts.index_dir)

    if opts.create_index:
        if len(sys.argv) < 2:
            sys.stderr.writelines("ERROR: need a directory to index\n")
            sys.exit(1)
        try:
            print("creating index:", fname)
            IndexFiles(sys.argv[1], fname, analyzer, not opts.all_line)
Example #22
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

from org.apache.lucene.queries.function.valuesource import LongFieldSource
from org.apache.lucene.queries.function import FunctionQuery
from org.apache.lucene.queries import CustomScoreQuery

from com.mongodb import BasicDBObject

from collections import Counter

import utils

indexDir = File("/tmp/github")

# 1. open the index
analyzer = KeywordAnalyzer()
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. parse the query from the command line
# a = {"typed_method_call": WhitespaceAnalyzer()}
# wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)

query_string = "HttpURLConnection.disconnect Exception.printStackTrace BufferedReader.close HttpURLConnection.setRequestProperty HttpURLConnection.setRequestMethod DataOutputStream.writeBytes HttpURLConnection.getInputStream DataOutputStream.close HttpURLConnection.setUseCaches StringBuffer.append URL.openConnection HttpURLConnection.getOutputStream Integer.toString String.getBytes StringBuffer.toString HttpURLConnection.setDoOutput BufferedReader.readLine DataOutputStream.flush HttpURLConnection.setDoInput"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                     ["typed_method_call"], analyzer)

#base_query = getSpanNearQuery(analyzer, query_string)
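
# A hedged continuation (mirroring the Stack Overflow snippet further down this page,
# and assuming IndexSearcher is imported alongside the other Lucene classes): parse
# the API-call string and fetch the top matches on the typed_method_call field.
base_query = query_parser.parse(query_string)
searcher = IndexSearcher(reader)
hits = searcher.search(base_query, 10)
print("%d hit(s) for the typed_method_call query." % len(hits.scoreDocs))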
Example #23
# boilerplate for setting up spellchecking:
from java.io import StringReader
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.index import IndexWriterConfig
from org.apache.lucene.analysis.core import KeywordAnalyzer

# Start JVM for Lucene.
lucene.initVM()

# Set up Lucene spellchecking.
dict_reader = StringReader(dict_str)
dictionary = PlainTextDictionary(dict_reader)

ramdir = RAMDirectory()
spellchecker = SpellChecker(ramdir)
spellchecker.indexDictionary(dictionary, IndexWriterConfig(KeywordAnalyzer()),
                             True)


# Run the word correction test.
def correct_word(word):
    candidates = spellchecker.suggestSimilar(word, 10)
    if len(candidates) > 0:
        return candidates[0]
    else:
        return ''
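

# Illustrative check (hedged; the output depends on the words present in dict_str):
# suggestSimilar proposes up to 10 dictionary entries ranked by string distance,
# and correct_word keeps the best one, or '' when nothing similar is indexed.
print(correct_word("exmple"))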


good, bad = [], []
with open('Edit_distance_corrections_{}.tab'.format(EXPERIM_ID),
          'w+') as corrs_file:
Example #24
def main(index_dir, input_dir):
    """Creates a SQLite database with news linked to other news by at least one term, backed by a Lucene Index"""
    lucene.initVM()

    # Open index
    logger.info("Opening Lucene index [%s]..." % index_dir)
    fs_dir = SimpleFSDirectory(Paths.get(index_dir))
    analyzer = KeywordAnalyzer()
    query_parser = QueryParser("title", analyzer)
    reader = DirectoryReader.open(fs_dir)
    searcher = IndexSearcher(reader)

    # Search documents
    onlyfiles = [
        f for f in listdir(input_dir)
        if isfile(join(input_dir, f)) and f.endswith('.json')
    ]
    rels = list()
    for f in onlyfiles:
        journal_code = f.split('.')[0]
        f = join(input_dir, f)
        json_data = open(f)
        data = json.load(json_data)
        # The results collected after comparison

        for entry in data:
            url = entry['url']
            date = entry['date']
            title = entry['title']

            logger.debug("Processing URL [%s] date [%s] - [%s]" %
                         (url, date, title))

            tt = nltk.word_tokenize(title)
            tokens = []
            for t in tt:
                tokens.append(t.lower())

            for token in tokens:
                q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (
                    token, date, journal_code, url)
                try:
                    query = query_parser.parse(q)
                except:
                    continue
                hits = searcher.search(query, MAX_HITS)

                logger.debug("Found %d document(s) that matched query '%s':" %
                             (hits.totalHits, query))

                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    logger.debug(doc)

                    rels.append({
                        'left': url,
                        'token': token,
                        'right': doc.get('url')
                    })
        json_data.close()

    with open('relationships.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile,
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_ALL)
        for rel in rels:
            csvwriter.writerow([rel['left'], rel['token'], rel['right']])
Example #25
        apis.append(answer["typed_method_call"])

    print apis


indexDir = File("/tmp/stackoverflow")

# 1. open the index
analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. parse the query from the command line
a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer()}
wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)

query_string = "lucene get similar documents to the current one"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["title"],
                                     wrapper_analyzer)

#base_query = getSpanNearQuery(analyzer, query_string)

base_query = query_parser.parse(query_string)

#http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
boost_query = FunctionQuery(LongFieldSource("view_count"))
query = CustomScoreQuery(base_query, boost_query)

# queryparser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
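
# A hedged follow-up (IndexSearcher construction assumed; not shown in the original
# snippet): run the view_count-boosted query and print the top-scoring titles.
searcher = IndexSearcher(reader)
for hit in searcher.search(query, 5).scoreDocs:
    print(searcher.doc(hit.doc).get("title"))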
Example #26
    def ancientSearch(self, field):
        sear = self._search
        fieldOnly = False
        # Field-only search (no search words given)
        if len(self._commandInfo.getWordList()) == 0:
            fieldOnly = True
            bq = BooleanQuery.Builder()
            fields = self._commandInfo.getFields()
            for key in fields:
                queryx = QueryParser(key, KeywordAnalyzer()).parse(fields[key][0])
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()

        elif len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
            bq = BooleanQuery.Builder()
            q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
            bc = BooleanClause(q, BooleanClause.Occur.MUST)
            bq.add(bc)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        elif self._commandInfo.getKey()[0] == '#':
            bq = BooleanQuery.Builder()
            query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
            query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
            bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
            bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
            bq.add(bc1).add(bc2)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        elif self._commandInfo.getKey()[0] in ['$', '+']:
            bq = BooleanQuery.Builder()
            for w in self._commandInfo.getWordList():
                queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        else:
            query = ''
        hits = sear.search(query, 9999)
        for hit in hits.scoreDocs:
            doc = sear.doc(hit.doc)
            res = doc.get(field)
            id = doc.get('id')
            detail = get_detail(doc)
            zhujie = detail['zhujie']
            if detail['detail'] and 'detail' in detail['detail'].keys():
                detail['detail'] = detail['detail']['detail']
            detail.pop('zhujie')
            detail.pop('text')
            detail.pop('type')
            detail = json.dumps(detail)
            if fieldOnly:
                if not doc.get("text").strip():
                    continue
                if id.count(".") == 2:
                    self._doc[id] = doc.get("text")
                    self._resultSentencesList.append((id, doc.get("text")))
                elif id.count(".") == 1:
                    searcher = self._search
                    query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1')
                    hits = searcher.search(query, 1)

                    for hit in hits.scoreDocs:
                        doc = searcher.doc(hit.doc)
                        res = doc.get("text")
                        if res:
                            self._doc[id+".1"] = doc.get('text')
                            self._resultSentencesList.append((id + ".1", doc.get('text')))
                else:
                    searcher = self._search
                    query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1.1')
                    hits = searcher.search(query, 1)
                    for hit in hits.scoreDocs:
                        doc = searcher.doc(hit.doc)
                        res = doc.get("text")
                        if not doc.get("text").strip():
                            continue
                        if res:
                            self._doc[id+".1.1"] = doc.get('text')
                            self._resultSentencesList.append((id + ".1.1", doc.get('text')))
            elif doc_hit(res, self._commandInfo):
                if key_filter(self._commandInfo, res):
                    if 'section' in self._commandInfo.getFields().keys():
                        if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['section'], 0):
                            continue
                    if 'document' in self._commandInfo.getFields().keys():
                        if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['document'], 1):
                            continue
                    self._doc[id] = res
                    self._resultSentencesList.append((id, res, detail, zhujie))
        return self