Exemplo n.º 1
0
 def __init__(self):
   # Create index directory
   lucene.initVM(lucene.CLASSPATH)
   if not os.path.exists(STORE_DIR):
     os.mkdir(STORE_DIR)
   self.store = lucene.SimpleFSDirectory(lucene.File(STORE_DIR))
   self.im = IndexManager()
Exemplo n.º 2
0
    def __init__(self, root, store_dir):

        if not os.path.exists(store_dir):
            os.mkdir(store_dir, 0777)


        # NOTE: Hardcoded the analyzer instead of passing it
        lucene.initVM()
        '''
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        '''
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        store = SimpleFSDirectory(File(store_dir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)

        # Set the permissions to 777 for the index directory and the write.lock file
        chmod_indexdir_cmd = "chmod 0777 " + store_dir
        writelock_file = store_dir + "/" + "write.lock"
        chmod_writelock_cmd = "chmod 0777 " + writelock_file

        if os.path.exists(store_dir):
            cicmd=os.popen("sudo -S %s"%(chmod_indexdir_cmd), 'w').write('vagrant')

        if os.path.exists(writelock_file):
            cwcmd=os.popen("sudo -S %s"%(chmod_writelock_cmd), 'w').write('vagrant')

        # setting CREATE will rewrite over the existing indexes.
        ###config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        writer.close()
Exemplo n.º 3
0
def start_jvm(clspath='', vmargs=''):
    """Starts the JVM - note that only the first initVM() is effective for java VM!
    Make sure you pass important arguments for the first call, because they will
    set the environment of the Java VM
    @keyword clspath: platform-separator separated values that will be passed to
        the Java VM
    @keyword vmargs: other arguments, comma-separated to pass to the java VM
    @return: JVM object (either new one or already existing one)"""


    #initialize the JVM if not already initialized
    jvm = _jcc_module.getVMEnv()
    if not jvm:
        classpath = []
        if clspath:
            classpath.append(clspath)
        if dumeanj:
            classpath.append(dumeanj.CLASSPATH)
        if lucene:
            classpath.append(lucene.CLASSPATH)
        jvm = _jcc_module.initVM(os.pathsep.join(classpath), vmargs=vmargs)

        if lucene != _jcc_module:
            lucene.initVM(lucene.CLASSPATH)
    elif vmargs:
        raise Exception('initVM() was already started, the second call will be ineffective. Please make sure you are initializing components in the right order!')

    return jvm
Exemplo n.º 4
0
	def _init_search(cls):
		"""
		Initializes everything needed for search.
		"""
		config_path = cls.search_config_path
		if not os.path.exists(config_path):
			raise OSError(errno.ENOENT, "Config %r does not exist." % config_path, config_path)
		config_dir = os.path.dirname(config_path)
		
		# Read config.
		with open(config_path, 'rb') as fh:
			config = json.load(fh)
		cls.search_config = config
		
		# Connect to mongo.
		host = config['mongo']['host']
		port = config['mongo'].get('port', None) or 27017
		thread_pool = reactor.getThreadPool()
		pool_size = int(math.ceil((thread_pool.min + thread_pool.max) / 2))
		cls.search_mongo = txmongo.lazyMongoConnectionPool(host=host, port=port, pool_size=pool_size)
		cls.search_order_db = cls.search_mongo[config['mongo']['order_dbname']]
		cls.search_order_tb = cls.search_order_db[config['mongo']['order_tbname']]
		
		# Initialize PyLucene.
		lucene.initVM()
		
		# Open index.
		index_path = os.path.abspath(os.path.join(config_dir, config['lucene']['index_path']))
		if not os.path.exists(index_path):
			raise OSError(errno.ENOENT, "Index %r does not exist." % index_path, index_path)
		elif not os.path.isdir(index_path):
			raise OSError(errno.ENOTDIR, "Index %r is not a directory." % index_path, index_path)
		index_dir = lucene.NIOFSDirectory(lucene.File(index_path))
		#index_dir = lucene.SimpleFSDirectory(lucene.File(index_path)) # windows
		cls.search_searcher = lucene.IndexSearcher(index_dir)
Exemplo n.º 5
0
def search(r, keyword=""):
    import logging

    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os

    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    directory = FSDirectory.open(File(CONFIG.INDEX_PATH))
    ROBOT_INDEX = IndexSearcher(directory, True)
    ROBOT_ANALYZER = StandardAnalyzer()

    keyword = keyword or r.GET["keyword"]
    query = QueryParser("context", ROBOT_ANALYZER)
    query = query.parse('"%s"' % keyword)

    bench.start_mark("search")
    hits = ROBOT_INDEX.search(query)
    count = len(hits)
    result = []
    i = 0
    for hit in hits:
        i += 1
        if i > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, i, keyword))
    ROBOT_INDEX.close()

    et = bench.stop_mark()

    return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
Exemplo n.º 6
0
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def build_lda_corpus(index_folder, paths_index_file,  
                     dictionary_file, ldac_file, min_frequency, 
                     min_word_len, max_word_len=20):
    '''
    The main function that does the job! 
    
    '''
    initVM()  
    store = SimpleFSDirectory(File(index_folder))
    index_reader = IndexReader.open(store)

    # Stores the file paths index (for LDA)
    _store_file_paths_index(index_reader, paths_index_file) 
    
    # Creates the dictionary 
    _create_dictionary(index_reader, dictionary_file, min_frequency, 
                       min_word_len, max_word_len)

    # Creates the corpus 
    dictionary = corpora.Dictionary().load(dictionary_file)      
    # doesn't load the corpus into the memory! 
    corpus_memory_friendly = _TextCorpus(dictionary, index_reader) 
    corpora.BleiCorpus.serialize(ldac_file, corpus_memory_friendly, 
                                 id2word=dictionary)
    
    logging.info('The Enron corpus building is completed.')
Exemplo n.º 8
0
    def run(self):
        print "Booting lucene driver worker...."
        lucene.initVM()

        self.fieldType1 = FieldType()
        self.fieldType1.setIndexed(True)
        self.fieldType1.setStored(False)
        self.fieldType1.setTokenized(True)

        self.fieldType2 = FieldType()
        self.fieldType2.setIndexed(True)
        self.fieldType2.setStored(True)
        self.fieldType2.setTokenized(False)

        while(True):
            data = self.queue.get()
            da = data[1]
            response = None
            try:
                self.fil = File(da['data']['indexdir'])
                self.d = NIOFSDirectory(self.fil)
                self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
                self.conf = IndexWriterConfig(
                    Version.LUCENE_CURRENT,
                    self.analyzer)

                response = getattr(self, da['action'])(da['data'])
                self.d.close()
            except Exception as e:
                print e
            if response is None:
                response = {}

            self.ret[data[0]] = response
Exemplo n.º 9
0
def names():
    lst = []
    
    search = "spax"#request.form['product']
    lucene.initVM()
    
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text", analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

    for hit in hits.scoreDocs:
        if hit.score >= 1:
            print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(',')
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
                    lst.append(item)
    #print lst
    data = {"products": lst}
    if request.method == 'POST':
        return jsonify(data)
    else:
	return jsonify(data)
Exemplo n.º 10
0
	def retrieve( self, query, max_res = 10 ):
		lucene.initVM()
		inDir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		lucene_analyzer = StandardAnalyzer( Version.LUCENE_30 )
		lucene_searcher = IndexSearcher( inDir )
		my_query = QueryParser( Version.LUCENE_30, 'content' , lucene_analyzer ).parse( query )
		MAX = max_res
		total_hits = lucene_searcher.search( my_query, MAX )
		res_head = '{"query":"' + query + '","results":['
		res_tail = ']}'
		result = res_head
		hits = total_hits.totalHits
		if ( hits > 0 ):
			res_body = ''
			it = 0
			for hit in total_hits.scoreDocs:
				it += 1
				doc = lucene_searcher.doc( hit.doc )
				res_body += '{"rank":' +\
							str( it ) +\
							',"score":"' +\
							str( hit.score ) +\
							'","title":"' +\
							doc.get( 'title' ).encode('utf-8') +\
							'","id":"' +\
							doc.get( 'id' ).encode('utf-8') +\
							'"}'
				if ( it < hits ):
					res_body += ','
			result += res_body
		result += res_tail
		return result
Exemplo n.º 11
0
	def document( self, docId, max_res = 1 ):
		lucene.initVM()
		inDir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		lucene_analyzer = StandardAnalyzer( Version.LUCENE_30 )
		lucene_searcher = IndexSearcher( inDir )
		my_query = QueryParser( Version.LUCENE_30, 'id' , lucene_analyzer ).parse( docId )
		MAX = max_res
		total_hits = lucene_searcher.search( my_query, MAX )
		result = '{'
		hits = total_hits.totalHits
		if ( hits == 1 ):
			for hit in total_hits.scoreDocs:
				doc = lucene_searcher.doc( hit.doc )
				result += '"id":"' +\
						  doc.get( 'id' ) +\
						  '","title":"' +\
						  doc.get( 'title' ) +\
						  '","abstract":"' +\
						  doc.get( 'abstract' ) +\
						  '","keyword":"' +\
						  doc.get( 'keyword' ) +\
						  '","content":"' +\
						  doc.get( 'content' ) +\
						  '","authors":"' +\
						  doc.get( 'authors' ) +\
						  '"'
		result += '}'
		return result
Exemplo n.º 12
0
def initial_searcher():
    lucene.initVM()
    indexDir = INDEX_DIR 
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)
    return searcher, analyzer
Exemplo n.º 13
0
def lucene_search(index_dir, limit, query_text):
    '''
    lucene_search: Search a built index and return upto limit number of responses 
    Arguments: Input index folder, limit value of results returned, query(as string)
    Returns: paths of responsive files as list
    '''
    
    logging.basicConfig(file=os.path.join(index_dir,"lucene_search.log"))
    logger.info("Initializing search....")
    lucene.initVM()
    logger.info("Reading index from "+index_dir)
    index = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_30) #Lucene version used to generate index
    searcher = IndexSearcher(index)
    
    logger.info("Parsing query :"+ query_text)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query_text)
    hits = searcher.search(query, limit)

    logger.info("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    hit_paths = []

    for hit in hits.scoreDocs:
        # The following code also generates score for responsive/found documents and the 
        # content index which matched
        # print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        hit_paths.append(doc.get("path"))
    
    return hit_paths 
Exemplo n.º 14
0
def luceneRetriver(query):

	lucene.initVM()

	indir = SimpleFSDirectory(File(INDEXDIR))

	lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)

	lucene_searcher = IndexSearcher(indir)

	my_query = QueryParser(Version.LUCENE_30,"text",\

	lucene_analyzer).parse(query)

	MAX = 1000

	total_hits = lucene_searcher.search(my_query,MAX)

	print "Hits: ",total_hits.totalHits

	for hit in total_hits.scoreDocs:

		print "Hit Score: ",hit.score, "Hit Doc:",hit.doc, "Hit String:",hit.toString()

		doc = lucene_searcher.doc(hit.doc)

		print doc.get("text").encode("utf-8")
	def search(self):
		''' Searches the given query in the index '''

		lucene.initVM(vmargs=['-Djava.awt.headless=true'])
		# print 'lucene', lucene.VERSION
		# base_dir = os.path.dirname(os.path.abspath('.'))
		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		

		while True:
			print
			print "Hit enter with no input to quit."
			command = raw_input("Query:")
			if command == '':
				return

			print
			print "Searching for:", command

			query = QueryParser(Version.LUCENE_CURRENT, "title",
								analyzer).parse(command)
			scoreDocs = searcher.search(query, 50).scoreDocs
			print "%s total matching documents." % len(scoreDocs)

			for scoreDoc in scoreDocs:
				doc = searcher.doc(scoreDoc.doc)
				# print 'path:', doc.get("path"), 'name:', doc.get("name")
				print doc
Exemplo n.º 16
0
def configure_lucene():
    
    f = open('clique.txt','r')
    lucene.initVM()
    print 'Inside Function'
    #indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t','')
        line = line.replace('\r','')
        line = line.replace('\n','')
  	line = line.replace('^','')
    	line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Exemplo n.º 17
0
def build_index():

    lucene.initVM()

    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    store = SimpleFSDirectory(File(index_store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
Exemplo n.º 18
0
	def index( self ):
		lucene.initVM()
		indexdir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		analyzer = StandardAnalyzer( Version.LUCENE_30 )
		index_writer = IndexWriter( indexdir, analyzer, True, IndexWriter.MaxFieldLength( 512 ) )
		# read input files (.xml)
		for in_file in glob.glob( os.path.join( self.DOC_DIR, '*.xml' ) ):
			corpus = codecs.open( in_file, encoding='utf-8' ).read()
			d = pq( corpus, parser='html' )
			for text in d( 'Article' ).items():
				document = Document()
				# find ID
				art_id = str( text.attr( 'articleid' ).encode( 'utf-8' ) ).replace( '+', '-' )
				# find Title
				art_title = self.stem( str( text.attr( 'title' ).encode( 'utf-8' ) ) )
				# find Abstract
				art_abstract = self.stem( str( text.find( 'Abstract' ).html().encode('utf-8') ) )
				# find Keyword
				art_keyword = text.find( 'Keyword' ).html().encode('utf-8')
				# find Content
				art_content = self.stem( str( text.find( 'Content' ).html().encode('utf-8') ) )
				# find Authors
				art_authors = text.find( 'Authors' ).html().encode('utf-8')
				document.add( Field( 'id', art_id, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'title', art_title, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'abstract', art_abstract, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'keyword', art_keyword, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'content', art_content, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'authors', art_authors, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'article', art_title + art_abstract + art_keyword + art_content,\
									 Field.Store.YES,\
									 Field.Index.ANALYZED ) )
				index_writer.addDocument( document )
			index_writer.optimize()
			index_writer.close()
Exemplo n.º 19
0
def get_candidates(qatp):

    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q,a,t,p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n+=1

        q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)
        
    return candidates
Exemplo n.º 20
0
	def __init__(self, writerConfig, indexDir):
		
		lucene.initVM()

		self.mIndexDir = SimpleFSDirectory(File(indexDir))
		self.mConfig = writerConfig
		self.mWriter = IndexWriter(self.mIndexDir, self.mConfig)
Exemplo n.º 21
0
def index(string):
 lucene.initVM()
 indexDir = "REMOVEME.index-dir"
 dir = SimpleFSDirectory(File(indexDir))
 analyzer = StandardAnalyzer(Version.LUCENE_30)
 try:
  writer = IndexWriter(dir, analyzer, False, IndexWriter.MaxFieldLength(512))
 except lucene.JavaError:
  #print 'Inside Index Except'
  writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
#e = sys.exc_info()[0]
#print e
 #print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

 doc = Document()
 doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
 writer.addDocument(doc)
 #print 'In the index function'
 #print writer.numDocs()

#print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
#print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
 writer.optimize()
#print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
#print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
 #print 'ending Indexing'
 #print string 
 #print 'Total indexes'
 #print writer.numDocs() 
 writer.close()
def index(indexdir):
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  f = open('data/docid.documento-xml.txt')
  st = PorterStemmer()
  for i, line in enumerate(f.readlines()):
    id, xmltext = line.split('\t')
    xmltext = xmltext.rstrip('\n')
    xmldoc = minidom.parseString(xmltext)
    title = xmldoc.getElementsByTagName("TITLE")
    title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
    authors = xmldoc.getElementsByTagName("AUTHORS")
    authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
    abstract = xmldoc.getElementsByTagName("ABSTRACT")
    abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
    doc = Document()
    doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    print "indexed %s docs" % (i+1)

  writer.close()
Exemplo n.º 23
0
def search(request, template_name="reviews/search.html"):
    """
    Searches review requests on Review Board based on a query string.
    """
    query = request.GET.get("q", "")
    siteconfig = SiteConfiguration.objects.get_current()

    if not siteconfig.get("search_enable"):
        # FIXME: show something useful
        raise Http404

    if not query:
        # FIXME: I'm not super thrilled with this
        return HttpResponseRedirect(reverse("root"))

    import lucene

    # We may have already initialized lucene
    try:
        lucene.initVM(lucene.CLASSPATH)
    except ValueError:
        pass

    index_file = siteconfig.get("search_index_file")
    store = lucene.FSDirectory.getDirectory(index_file, False)
    try:
        searcher = lucene.IndexSearcher(store)
    except lucene.JavaError, e:
        # FIXME: show a useful error
        raise e
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)

    fields = ["title", "abstract", "authors"]

    st = PorterStemmer()
    for id, q in queries.iteritems():
        query = q
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(query)
        qwords_k = [st.stem(q) for q in qwords]
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc+1, i+1, hit.score))
            # print hit.doc+1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")
    f.close()
Exemplo n.º 25
0
def irsolver(data_file, index) :
	from questions import get_input_data
	lucene.initVM()
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	reader = IndexReader.open(SimpleFSDirectory(File(index)))
	searcher = IndexSearcher(reader)
	pred = []
	mapp = { 1 : 'A', 2 : 'B', 3 : 'C', 4 : 'D'}

	idx, ques, ans = get_input_data(data_file)
	for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)) :
		max_score = -1000000
		best_ans = 'A'
		for i, ai in enumerate(a):
			sc = query(q, ai, analyzer, searcher)
			print(acm, i, sc)
			if sc > max_score :
				max_score = sc
				best_ans = mapp[i+1]
		pred.append(best_ans)

	return idx, pred
Exemplo n.º 26
0
def wikipedia_indexer(storage, wikipedia_file) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open(wikipedia_file)

	for i, line in enumerate(f) :
		text = line.strip().decode('utf-8').split('\t')
		title = text[0]
		if 'disambigu' in text[0] or len(text) < 2:
			continue
		text = text[1]
		doc = Document()
		doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
		doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
		doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
		writer.addDocument(doc)
		if writer.numDocs() % 1000 == 0 :
			print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
		
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()	
    def build_words_index(self):
        relevant_words = self.process_texts()

        # Initialize lucene and JVM
        lucene.initVM()

        # Get the analyzer
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        # Get index storage
        store = SimpleFSDirectory(File(self.index_words))

        # Get index writer
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        writer = IndexWriter(store, config)

        try:
            for word in relevant_words:
                time_series = TimeSeries(word).get_series()
                series_str  = ''
                for t in time_series:
                    series_str += str(t) + ':' + str(time_series[t]) + '\t'
                doc = Document()
                # Add a fields to this document
                doc.add(Field('word', word, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field('series', series_str, Field.Store.YES, Field.Index.ANALYZED))
                # Add the document to the index
                writer.addDocument(doc)
        except Exception, e:
            print "Failed in creating document to add to the index:", e
Exemplo n.º 28
0
def ExportIndex(b_print = False,b_write_file = False,b_filter = True):
    _dict = ReadConfig()
    initVM()
    try:
        if(b_write_file == True):
            output_file = _dict['resultDir'] + '/' + sys.argv[1] + '.xls'
            _fw = open(output_file,'w')
        directory = SimpleFSDirectory(File(_dict['indexDir']))
        ireader = IndexReader.open(directory)
        # Enum all the terms
        all_terms = ireader.terms()
        word_dict = {}
        _stopword_set = ImportStopword()
#        SetPrint(_stopword_set)
        while all_terms.next():
            term_elem = all_terms.term()
            if term_elem.field() == sys.argv[1]:
                _temp = term_elem.text().rstrip()
                word_dict[_temp] = all_terms.docFreq()
        if(b_filter == True):
            StopwordFilter(word_dict,_stopword_set)
        if(b_print != False):
            DictPrint(word_dict)
        if(b_write_file != False):
            DictPrint(word_dict,out_file=_fw)
            _fw.close()
        all_terms.close()
        return word_dict
    except Exception,e:
        print "Failed: ",e
        traceback.print_exc(file=sys.stdout)
Exemplo n.º 29
0
def search():

	lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    
	args = []
	if request.method == 'POST':
		if request.form['ies']:
			args.append('+ies:'+request.form['ies'])
		if request.form['area']:
			args.append('+area:'+request.form['area'])
		if request.form['professor']:
			args.append('+professor:'+request.form['professor'])
		if request.form['conceito']:
			#args.append('m:'+request.form['conceito']+'d:'+request.form['conceito']+'f:'+request.form['conceito'])
			args.append('m:'+request.form['conceito'])
			args.append('d:'+request.form['conceito'])
			args.append('f:'+request.form['conceito'])

	table = []
	if(len(args) > 0): 
		scoreDocs = mansearch.buscar('indexer/',args)
		fsDir = SimpleFSDirectory(File(indexDir))
		searcher = IndexSearcher(DirectoryReader.open(fsDir))
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			table.append(dict((field.name(), field.stringValue()) for field in doc.getFields()))
	return render_template('busca.html',table = table)
	
	pass
Exemplo n.º 30
0
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()