def lucene_english_normalizer(text):
    # Normalize text scraped from Wikipedia: strip HTML markup, then tokenize,
    # stopword-filter, and stem with Lucene's EnglishAnalyzer.
    reader = StringReader(text)
    html_stripped = HTMLStripCharFilter(reader)
    analyzer = EnglishAnalyzer(Version.LUCENE_4_10_1)
    token_stream = analyzer.tokenStream('field', html_stripped)
    token_stream.reset()
    lemmas = []
    while token_stream.incrementToken():
        term_attr = token_stream.getAttribute(tokenattributes.CharTermAttribute.class_)
        lemmas.append(term_attr.toString())
    token_stream.close()
    return lemmas
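
# A minimal usage sketch (not part of the original snippet): assumes PyLucene
# 4.10.x is installed and the JVM has not yet been started; the HTML sample
# below is made up.
import lucene
lucene.initVM()  # must run once per process before any Lucene class is used

print(lucene_english_normalizer("<p>The <b>quick</b> brown foxes were running.</p>"))
# expected output, roughly: ['quick', 'brown', 'fox', 'were', 'run']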
Example No. 2
def process_q_test(q, out_q):
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()

    index = DirectoryReader.open(SimpleFSDirectory(
        Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    preprocessor = Preprocess()

    while not exitFlag:
        qid, query = q.get()
        tname = multiprocessing.current_process().name
        # print(tname, qid, query)
        if query == "DONE":
            break

        try:
            # dids, scores = get_lm_matched_docs(query, searcher, qparser, 2000)
            # if len(dids) >= 10:
            #     out_q.put((qid, dids, scores))
            dids_text = get_lm_doc_snippets(query, searcher, qparser, analyzer,
                                            preprocessor)
            out_q.put((qid, dids_text))
        except Exception:
            print('%s exception %s, %s' % (tname, qid, query))
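
# A hedged sketch (not in the original) of driving this worker with
# multiprocessing; the worker count and the sample query are illustrative,
# and exitFlag stands in for the module-level flag the worker polls.
import multiprocessing

exitFlag = False

if __name__ == '__main__':
    q, out_q = multiprocessing.Queue(), multiprocessing.Queue()
    workers = [multiprocessing.Process(target=process_q_test, args=(q, out_q))
               for _ in range(4)]
    for w in workers:
        w.start()
    q.put(('301', 'international organized crime'))  # sample TREC-style topic
    for _ in workers:
        q.put((None, 'DONE'))  # sentinel: one per worker shuts them all down
    for w in workers:
        w.join()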
Example No. 3
def main():
    try:
        print("Indexing...")
        indexDir = File("/home/ubuntu/Desktop/CoCaBu_remote/GitSearch/Indices")

        # Old-style alternative:
        # writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = KeywordAnalyzer()  # alternative: PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        per_field = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, per_field)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.close()
    except CorruptIndexException as e:    # the index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:    # another writer holds the index lock
        e.printStackTrace()
    except IOException as e:    # the directory can't be read or written
        e.printStackTrace()
Example No. 4
def main():
    try:
        print("Indexing starts...")
        # indicesDestination = File("/Users/Falcon/Desktop/dyclink_2014")
        indicesDestination = File("/Indices/dyclink/2014")

        analyzer = KeywordAnalyzer()  # treats the whole text as a single token (effectively the same as not analyzing at all)
        per_field = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }  # per-field map for PerFieldAnalyzerWrapper (a plain dict in Python)
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, per_field)
        # see http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)

        counter = Counter()
        generate_indices_from_projects(writer, counter)
        writer.close()

        print("Done")
        print(str(counter))

    except CorruptIndexException as e:    # the index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:    # another writer holds the index lock
        e.printStackTrace()
    except IOException as e:    # the directory can't be read or written
        e.printStackTrace()
Example No. 5
def publish_services(self, service_list):
    transformer = WSDLTransformer()
    current_document = 1
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT,
                                     EnglishAnalyzer(Version.LUCENE_CURRENT))
    writerConfig.setSimilarity(BM25Similarity())
    index_writer = IndexWriter(indexDir, writerConfig)
    for wsdl in service_list:
        if self._document_expansion:
            # bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
            bag_of_words = ' '.join(
                self._semantic_transformer.transform(transformer.transform(wsdl)))
        else:
            # bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
            bag_of_words = ' '.join(transformer.transform(wsdl))
        doc = Document()
        doc.add(Field("content", bag_of_words, Field.Store.YES,
                      Field.Index.ANALYZED))
        doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
        index_writer.addDocument(doc)
        current_document += 1
    index_writer.close()
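
# Note (not in the original): the Field.Store/Field.Index constructor used
# above is the pre-4.x Lucene API; in Lucene 4+ the rough equivalents are
# typed fields:
#     doc.add(TextField("content", bag_of_words, Field.Store.YES))  # analyzed + stored
#     doc.add(StoredField("path", wsdl))                            # stored only, not indexed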
Example No. 6
def index(indexdir):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(indexdir))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    with open('data/docid.documento-xml.txt') as f:
        for i, line in enumerate(f):
            doc_id, xmltext = line.split('\t')
            xmltext = xmltext.rstrip('\n')
            xmldoc = minidom.parseString(xmltext)
            title = xmldoc.getElementsByTagName("TITLE")
            title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
            authors = xmldoc.getElementsByTagName("AUTHORS")
            authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
            abstract = xmldoc.getElementsByTagName("ABSTRACT")
            abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
            doc = Document()
            doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("id", doc_id, Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)
            print("indexed %s docs" % (i + 1))

    writer.close()
Example No. 7
def tokenize_text(text):
    # drop non-word symbols, then run the text through the EnglishAnalyzer via
    # the query parser to get stemmed, stopword-filtered tokens back as a string
    ntext = re.sub(r'\W+', ' ', text)
    analyzer = EnglishAnalyzer()
    parser = StandardQueryParser(analyzer)
    parsed_text = parser.parse(ntext, '').toString('')
    parsed_text = re.sub('[)()]', '', parsed_text)
    return parsed_text
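
# A hedged usage sketch (not in the original); exact token forms depend on the
# Lucene version, but stemming and stopword removal look roughly like this:
print(tokenize_text("Running quickly, the foxes jumped!"))
# e.g. 'run quickli fox jump'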
Example No. 8
def run(self):
    print("Starting " + self.name)
    lucene.getVMEnv().attachCurrentThread()  # each worker thread must attach to the JVM
    index = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    # process_query(self.name, self.q, self.out_q, searcher, qparser)
    print("Exiting " + self.name)
Example No. 9
def main():
    facts = get_all_facts()
    print("Preparing to index {} facts".format(len(facts)))

    store_dir = "lucene_index"
    store = SimpleFSDirectory(Paths.get(store_dir))
    analyzer = EnglishAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(store, config)
    index_facts(facts, writer)
    writer.commit()
    writer.close()
    print("Lucene index created at: {}".format(store_dir))
Example No. 10
def main():
    store_dir = "lucene_index"
    if not os.path.isdir(store_dir):
        raise RuntimeError("Cannot find Lucene index at: {}".format(store_dir))
    store = SimpleFSDirectory(Paths.get(store_dir))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = EnglishAnalyzer()

    # query_string = "House is a simple fact about science reaction"
    # query_string = get_random_question()
    # search(query_string, analyzer, searcher)
    # by_random_question(analyzer, searcher)
    annotate_all_questions(analyzer, searcher)
    del searcher
Example No. 11
def find(self, query):
    transformer = StringTransformer()
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())
    processed_query = ' '.join(self._preprocessor(transformer.transform(query)))
    query = QueryParser(Version.LUCENE_CURRENT, "content",
                        analyzer).parse(processed_query)
    hits = searcher.search(query, 10)  # top-10 results
    result_list = []
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        result_list.append(doc.get("path").encode("utf-8"))
    return result_list
Example No. 12
    def __init__(self, lang):
        lucene.initVM()

        if lang == 'zh':
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_ZH)))
            analyzer = SmartChineseAnalyzer()
        elif lang == 'en':
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_EN)))
            analyzer = EnglishAnalyzer()
        else:
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))

        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        logger.debug('search similarity func: {}'.format(
            self.searcher.getSimilarity()))
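
    # No query method is shown in this snippet; a hypothetical `search` built
    # from the same members might look like this (the field name 'content' is
    # an assumption):
    def search(self, query_text, field='content', top_k=10):
        # parse with the analyzer chosen in __init__ so querying matches indexing
        query = QueryParser(field, self.analyzer).parse(query_text)
        hits = self.searcher.search(query, top_k)
        return [(self.searcher.doc(hit.doc), hit.score) for hit in hits.scoreDocs]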
Example No. 13
    def __init__(self, lang):
        lucene.initVM()

        if lang == 'zh':
            logger.info("index directory:{}".format(config.IDX_COS_ZH))
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_ZH)))
            analyzer = SmartChineseAnalyzer()
        elif lang == 'en':
            logger.info("index directory:{}".format(config.IDX_COS_EN))
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_EN)))
            analyzer = EnglishAnalyzer()
        else:
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))
        writerConfig = IndexWriterConfig(analyzer)
        writerConfig.setSimilarity(mySimilarity())
        logger.debug('writer similarity func: {}'.format(
            writerConfig.getSimilarity()))
        writer = IndexWriter(indexDir, writerConfig)
        self.writer = writer
Example No. 14
def retrieving(searchword):
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    idxDocs = reader.maxDoc()
    print("We have ", idxDocs, " indexed documents")
    searcher = IndexSearcher(reader)
    idx_analyzer = EnglishAnalyzer()
    # Search for the input term in the field stored as "text".
    # To search multiple fields, try MultiFieldQueryParser, though it is not
    # recommended; it is better to combine everything searchable into a single
    # field and use wildcard matching on it.
    query = QueryParser("text", idx_analyzer).parse(searchword)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print("Found %s document(s) that matched query '%s':" % (hits.totalHits, query))
    try:
        for hit in hits.scoreDocs:
            print(hit.score, hit.doc, hit.toString())
            doc = searcher.doc(hit.doc)
            print(doc.get("text"))
    except Exception:
        print("Could not find the word")
Example No. 15
def main(src, dst):
    try:
        start_time = time.time()

        print("Indexing starts...")
        indicesDestination = File(dst)
        # Old-style alternative:
        # writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        # Analyzer: text such as a body or title must be split into words by an
        # analyzer before it is indexed. The Analyzer is passed, together with
        # the Directory, to the IndexWriter constructor; it splits the text into
        # the unit words to index and strips out unneeded words.

        analyzer = KeywordAnalyzer()  # treats the whole text as a single token (effectively the same as not analyzing at all)
        per_field = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }  # per-field map for PerFieldAnalyzerWrapper (a plain dict in Python)
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, per_field)
        # see http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        # SimpleFSDirectory stores the index files in a directory on the file
        # system (RAM- and DB-backed directories also exist); config carries the
        # analyzer setup the IndexWriter needs.

        counter = Counter()
        generate_indices_from_projects(src, writer, counter)
        writer.close()
        print("Done")
        print(str(counter))
        print("$$$%s\tseconds" % (time.time() - start_time))

    except CorruptIndexException as e:    # the index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:    # another writer holds the index lock
        e.printStackTrace()
    except IOException as e:    # the directory can't be read or written
        e.printStackTrace()
Example No. 16
def indexing(datadir):
    indexedDocs = 0
    #index_outdir = str(input("Enter index output dir: "))
    path = Paths.get('indexOut')
    indexOut = SimpleFSDirectory(path)
    analyzer = EnglishAnalyzer()
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexOut, config)
    for filename in glob.iglob(datadir + '/*.json*', recursive=True):
        try:
            print("Filename is", filename)
            with open(filename) as f:
                for line in f:
                    tweet = json.loads(line)
                    if tweet['lang'] == 'en':
                        # create a fresh Document per tweet, otherwise fields accumulate
                        doc = Document()
                        doc.add(StringField("id", tweet['id_str'], Field.Store.YES))
                        # doc.add(Field("screen_name", tweet['user.screen_name']))
                        # doc.add(Field("name", tweet['user.name']))
                        # doc.add(Field("location", tweet['user.location']))
                        doc.add(TextField("text", tweet['text'], Field.Store.YES))
                        # doc.add(Field("created_at", DateTools.stringToDate(tweet['created_at']), Field.Store.YES))
                        doc.add(TextField("created_at", tweet['created_at'], Field.Store.YES))
                        # doc.add(IntPoint("followers", tweet['user.followers_count'], Field.Store.YES))
                        # doc.add(IntPoint("friends", tweet['friends_count'], Field.Store.YES))
                        writer.addDocument(doc)
                        writer.commit()
                        indexedDocs += 1
        except Exception:
            continue  # skip files with malformed JSON

    writer.close()
    print("Indexed ", indexedDocs, " documents")
Example No. 17
def main():
    try:
        print("Indexing starts...")
        indicesDestination = File("/Users/Falcon/Desktop/New_Indices/IJA_Indices")

        analyzer = KeywordAnalyzer()
        per_field = {"code": JavaCodeAnalyzer(), "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)}
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, per_field)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        counter = Counter()
        generate_indices_from_projects(writer, counter)
        writer.close()

        print("Done")
        print(str(counter))

    except CorruptIndexException as e:    # the index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:    # another writer holds the index lock
        e.printStackTrace()
    except IOException as e:    # the directory can't be read or written
        e.printStackTrace()

def run(searcher, analyzer):
    while True:
        print()
        print("Hit enter with no input to quit.")
        command = input("Query:")
        if command == '':
            return
        print()
        print("Searching for:", command)
        query = QueryParser("contents", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print("%s total matching documents." % len(scoreDocs))

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print('path:', doc.get("path"), 'name:', doc.get("name"),
                  'score: %f' % scoreDoc.score)


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)
    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = EnglishAnalyzer()
    run(searcher, analyzer)
    del searcher
                    contents = file.read()  # assumes the file was opened with encoding='iso-8859-1'
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                except Exception as e:
                    print("Failed in indexDocs:", e)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(IndexFiles.__doc__)
        sys.exit(1)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)
    start = datetime.now()
    try:
        base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        IndexFiles(sys.argv[1], os.path.join(base_dir, INDEX_DIR),
                   EnglishAnalyzer())
        end = datetime.now()
        print(end - start)
    except Exception as e:
        print("Failed: ", e)
        raise e