Example #1
def luceneRetriver(query):
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)
    print "Hits: ", total_hits.totalHits
    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "Hit String:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
Example #2
	def retrieve( self, query, max_res = 10 ):
		lucene.initVM()
		inDir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		lucene_analyzer = StandardAnalyzer( Version.LUCENE_30 )
		lucene_searcher = IndexSearcher( inDir )
		my_query = QueryParser( Version.LUCENE_30, 'content' , lucene_analyzer ).parse( query )
		MAX = max_res
		total_hits = lucene_searcher.search( my_query, MAX )
		res_head = '{"query":"' + query + '","results":['
		res_tail = ']}'
		result = res_head
		hits = total_hits.totalHits
		if ( hits > 0 ):
			res_body = ''
			it = 0
			for hit in total_hits.scoreDocs:
				it += 1
				doc = lucene_searcher.doc( hit.doc )
				res_body += '{"rank":' +\
							str( it ) +\
							',"score":"' +\
							str( hit.score ) +\
							'","title":"' +\
							doc.get( 'title' ).encode('utf-8') +\
							'","id":"' +\
							doc.get( 'id' ).encode('utf-8') +\
							'"}'
				if ( it < hits ):
					res_body += ','
			result += res_body
		result += res_tail
		return result
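
Building JSON by string concatenation, as above, breaks as soon as a title contains a quote or backslash; the same applies to later snippets that assemble JSON by hand. A safer sketch under the same assumptions (same fields, standard json module; hits_to_json is a hypothetical name):

import json

def hits_to_json(query, searcher, score_docs):
    results = []
    for rank, hit in enumerate(score_docs, 1):
        doc = searcher.doc(hit.doc)
        results.append({'rank': rank, 'score': hit.score,
                        'title': doc.get('title'), 'id': doc.get('id')})
    # json.dumps handles quoting and escaping for us
    return json.dumps({'query': query, 'results': results})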
Example #3
def run(writer, analyzer):
	while True:
		print
		print "Hit enter with no input to quit."
		command = raw_input("Query:")
		if command == '':
			return

		print "Searching for:", command
		reader = writer.getReader()  # don't shadow the IndexReader class
		searcher = IndexSearcher(reader)
		#query = QueryParser(Version.LUCENE_CURRENT, "hashtag", analyzer).parse(command)
		#scoreDocs = searcher.search(query, 50).scoreDocs
		wildquery = command + "*"
		term = Term("hashtag", wildquery)
		query = WildcardQuery(term)
		scoreDocs = searcher.search(query, 5).scoreDocs
		print "%s total matching documents." % len(scoreDocs)

		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			# float() avoids Python 2 integer division truncating the ratio to 0
			score = (float(len(command)) / len(doc.get("hashtag"))) * scoreDoc.score
			print 'tweet:', doc.get("contents")
			print 'user_name:', doc.get("user_name")
			print 'when:', doc.get("creation_date")
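
Since the wildcard here is always a trailing "*", the same search can be sketched more directly with PrefixQuery from the same Lucene API, which matches the prefix case without general wildcard parsing:

query = PrefixQuery(Term("hashtag", command))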
Example #4
    def displayResults(self, query, sort):

        searcher = IndexSearcher(self.directory, True)

        fillFields = False
        computeMaxScore = False
        docsScoredInOrder = False
        computeScores = True

        collector = TopFieldCollector.create(sort, 20,
                                             fillFields,
                                             computeScores,
                                             computeMaxScore,
                                             docsScoredInOrder)

        searcher.search(query, None, collector)
        scoreDocs = collector.topDocs().scoreDocs

        print "\nResults for:", query, "sorted by", sort
        print "Title".rjust(30), "pubmonth".rjust(10), \
              "id".center(4), "score".center(15)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            title = doc["title"]
            if len(title) > 30:
                title = title[:30]
            print title.encode('ascii', 'replace').rjust(30), \
                  doc["pubmonth"].rjust(10), \
                  str(scoreDoc.doc).center(4), \
                  ("%06f" % (scoreDoc.score)).rjust(12)
            print "  ", doc["category"]
            # print searcher.explain(query, scoreDoc.doc)

        searcher.close()
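
A hypothetical invocation of the method above (example is an assumed instance of the surrounding class; the pubmonth field matches the printed columns):

# sort hits by pubmonth, newest first
sort = Sort(SortField("pubmonth", SortField.INT, True))
example.displayResults(TermQuery(Term("contents", "java")), sort)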
Example #5
    def post(self):
      q = self.get_argument("query")

      # self.write(key)

    # def query(query):
      # query = self.get_argument("q")
      lucene.initVM()
      indexDir = "index"
      dir = SimpleFSDirectory(File(indexDir))
      analyzer = StandardAnalyzer(Version.LUCENE_30)
      searcher = IndexSearcher(dir)
      
      query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
      MAX = 10
      hits = searcher.search(query, MAX)
      
      print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
      items = []
      rQ = []
      
      #for key, value in doc_urls.iteritems() 
       # print (key, value)

      for hit in hits.scoreDocs:
          #items.append({'score':hit.score, 'doc':hit.doc, 'blah':hit.toString(), 'url':doc_urls[str(hit.doc)]})
          print hit.score, hit.doc, hit.toString()
          print(len(doc_urls))
          items.append(doc_urls[str(hit.doc)])
          doc = searcher.doc(hit.doc) 
          print(hit.doc)
        
      self.render("index.html", title="Results", items=items, query=q)
Example #6
def get_indexed_file_details(ts_results, lucene_index_dir):
    '''
    This function gets each files details from the lucene 
    index. 
    
    Arguments: 
        ts_results - topic search results, each item contains 
                     [file id, root, file name, similarity score]
        lucene_index_dir - lucene index directory 
    
    Returns: 
        file details in a list 
    '''
    
    store = SimpleFSDirectory(File(lucene_index_dir))
    searcher = IndexSearcher(store, True)
    
    rows = []
    for rs in ts_results:
        doc = searcher.doc(rs[0])
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field,'empty') != 'empty' :
                row.append(table.get(field,'empty'))
            else: 
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID,'empty')))
        row.append(str(rs[3])) # similarity score
        
        rows.append(row)
    
    return rows
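
A hypothetical call, matching the row shape described in the docstring ([file id, root, file name, similarity score]; the values are made up):

ts_results = [[12, '/data', 'report.txt', 0.83]]
rows = get_indexed_file_details(ts_results, '/path/to/lucene/index')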
Example #7
def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    #create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        #print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if (len(nonDiverse) < 10):
            nonDiverse.append(new_urls[str(hit.doc)])
        #find the document that corresponds to the html website and append to a list for min distance
        website = new_urls[str(hit.doc)]
        #html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
Example #8
	def document( self, docId, max_res = 1 ):
		lucene.initVM()
		inDir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		lucene_analyzer = StandardAnalyzer( Version.LUCENE_30 )
		lucene_searcher = IndexSearcher( inDir )
		my_query = QueryParser( Version.LUCENE_30, 'id' , lucene_analyzer ).parse( docId )
		MAX = max_res
		total_hits = lucene_searcher.search( my_query, MAX )
		result = '{'
		hits = total_hits.totalHits
		if ( hits == 1 ):
			for hit in total_hits.scoreDocs:
				doc = lucene_searcher.doc( hit.doc )
				result += '"id":"' +\
						  doc.get( 'id' ) +\
						  '","title":"' +\
						  doc.get( 'title' ) +\
						  '","abstract":"' +\
						  doc.get( 'abstract' ) +\
						  '","keyword":"' +\
						  doc.get( 'keyword' ) +\
						  '","content":"' +\
						  doc.get( 'content' ) +\
						  '","authors":"' +\
						  doc.get( 'authors' ) +\
						  '"'
		result += '}'
		return result
Example #9
def get_doc_details(doc_id, lucene_index_dir):
    '''
    This function gets a file's details from 
    the lucene index. 
    
    Arguments: 
        doc_id - file id
        lucene_index_dir - lucene index directory 
    
    Returns: 
        file details as a list 
    '''
    
    store = SimpleFSDirectory(File(lucene_index_dir))
    searcher = IndexSearcher(store, True)
    
    doc = searcher.doc(doc_id)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    row = []
    metadata = MetadataType._types
    for field in metadata:
        if table.get(field,'empty') != 'empty' :
            row.append(table.get(field,'empty'))
        else: 
            row.append('')
    row.append(str(table.get(MetadataType.FILE_ID,'empty')))

    return row 
Example #10
class OccuredCandidates:
	indexDir = 'data/index'
	max_candidates = 30

	def __init__(self):
		lucene.initVM()
		self._lversion = Version.LUCENE_30
		self._analyzer = EnglishAnalyzer(self._lversion)
		self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))

		self._translation = loadTranslation()
		self._links = loadLinks()

	def find(self, phrase):
		phrase = phrase.lower().encode('utf8')
		query = ' '.join(['+' + word for word in phrase.split(' ')])
		query = QueryParser(self._lversion, 'contents', self._analyzer).parse(query)
		hits = self._searcher.search(query, self.max_candidates)

		# if not hits.totalHits: print "%d documents for '%s'" % (hits.totalHits, str(query)) # potential bug

		# todo put article_id in lucene index instead of translating document title

		links = {}
		for hit in hits.scoreDocs:
			title = quote(self._searcher.doc(hit.doc).get("title").encode('utf-8').replace(' ', '_')).replace('%28', '(').replace('%29', ')')
			if title in self._translation:
				links[self._translation[title]] = hit.score
			# else: print title # potential bug

		return self._links[phrase].get(-1, 0), links

	def clear_links(self, annotations):
		return filter(lambda annotation: annotation['links'] and max(annotation['links'].values()) > 1, annotations)
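
One caveat on find(): it splices raw words straight into QueryParser syntax, so characters such as ':' or '(' in the phrase raise a parse error. A short sketch of the usual guard, using the QueryParser.escape static method from the same API:

safe = QueryParser.escape(phrase)
query = ' '.join('+' + word for word in safe.split())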
Example #11
def lucene_search(index_dir, limit, query_text):
    '''
    lucene_search: Search a built index and return up to limit number of responses
    Arguments: Input index folder, limit value of results returned, query(as string)
    Returns: paths of responsive files as list
    '''
    
    logging.basicConfig(filename=os.path.join(index_dir, "lucene_search.log"))
    logger.info("Initializing search....")
    lucene.initVM()
    logger.info("Reading index from "+index_dir)
    index = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_30) #Lucene version used to generate index
    searcher = IndexSearcher(index)
    
    logger.info("Parsing query :"+ query_text)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query_text)
    hits = searcher.search(query, limit)

    logger.info("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    hit_paths = []

    for hit in hits.scoreDocs:
        # The following code also generates score for responsive/found documents and the 
        # content index which matched
        # print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        hit_paths.append(doc.get("path"))
    
    return hit_paths 
Example #12
def luceneRetriver(query):
	lucene.initVM()
	indir = SimpleFSDirectory(File(INDEXDIR))
	lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
	lucene_searcher = IndexSearcher(indir)
	my_query = QueryParser(Version.LUCENE_30, "text",
		lucene_analyzer).parse(query)
	MAX = 1000
	total_hits = lucene_searcher.search(my_query, MAX)
	print "Hits: ", total_hits.totalHits
	for hit in total_hits.scoreDocs:
		print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "Hit String:", hit.toString()
		doc = lucene_searcher.doc(hit.doc)
		print doc.get("text").encode("utf-8")
Example #13
    def query(self, title):
        self._th.attachCurrentThread()
        searcher = IndexSearcher(self._dir)
        query = QueryParser(Version.LUCENE_30, "title", self._analyzer).parse(title)
        total_hits = searcher.search(query, 10)
        for hit in total_hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            return doc.get("title") + "\n" + doc.get("content") + "--------------------------------"
        return "None"
Example #14
def search(command=command1):
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    res = searcher.search(query, 1000000)
    print 'Total hits:', res.totalHits
#    return searcher, res
    return [searcher.doc(doc.doc) for doc in res.scoreDocs[:20]]
Example #15
    def find(self, query, indir):
        lucene.initVM()
        INDEXDIR = indir

        indir = SimpleFSDirectory(File(INDEXDIR))
        lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
        lucene_searcher = IndexSearcher(indir)
        my_query = QueryParser(Version.LUCENE_30,"<default field>",\
        lucene_analyzer).parse("text:" + query + " OR title:" + query)
        MAX = 1000
        total_hits = lucene_searcher.search(my_query, MAX)
        print "\nHits: ", total_hits.totalHits, "\n"

        for hit in total_hits.scoreDocs:
            print "Hit Score:", "%.4f" % hit.score, "Department:", lucene_searcher.doc(
                hit.doc).get("department").encode(
                    "utf-8"), "Title:", lucene_searcher.doc(
                        hit.doc).get("title").encode("utf-8")
            print lucene_searcher.doc(hit.doc).get("url").encode("utf-8"), '\n'
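
Rather than splicing the query string into the two fields by hand, the same search can be sketched with MultiFieldQueryParser from the same Lucene 3.x API (the names below match the snippet above; the SHOULD flags make each field optional):

fields = ["text", "title"]
flags = [BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD]
my_query = MultiFieldQueryParser.parse(Version.LUCENE_30, query,
                                       fields, flags, lucene_analyzer)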
Example #16
    def getCrowds(self, query, field=CrowdFields.text):
        searcher = IndexSearcher(self.index, True)
        q = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(query)
        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(q, collector)
        hits = collector.topDocs().scoreDocs

        return [
            searcher.doc(scoreDoc.doc).get(CrowdFields.id)
            for scoreDoc in hits]
Example #17
class LuceneSearch(object):
    def __init__(self):
        STORE_DIR = "index"
        initVM()
        print 'lucene', VERSION
        self.directory = SimpleFSDirectory(File(STORE_DIR))
        print self.directory
        self.searcher = IndexSearcher(self.directory, True)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    def close(self):
        self.searcher.close()
    
    def raw_search(self, query_string):
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            self.analyzer).parse(query_string)
        scoreDocs = self.searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        matches = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            #print 'doc matched = ', dir(doc)
            contents = LuceneDoc.load(doc.get('name'))
            matches.append({'contents' : contents, 'doc' : doc})
        return matches
           
    def search(self, query):
        matches = self.raw_search(query)
        results = ''
        if len(matches) > 0:
            results += str(len(matches))+" results <br/>"
            for match in matches:
                results += '<a href='+str(match['contents']['dealUrl'])+'>'+str(match['contents']['merchant'])+'</a><br />'
                results += '<p>'+str(match['contents']['shortAnnouncementTitle'])+','+str(match['contents']['redemptionLocation'])+'</p><br/>'
        else:
            results = "0 results <br/>"
        return results
        
    def cli_search(self):
        while True:
            print
            print "Hit enter with no input to quit."
            command = raw_input("Query:")
            if command == '':
                return
            matches = self.raw_search(command)
            print
            print "Searching for:", command
            
            for match in matches:
                print match['contents']['dealUrl']
                print match['contents']['merchant'], ',', match['contents']['redemptionLocation'], ', ', match['contents']['div']
                print match['contents']['shortAnnouncementTitle']
                print '-'*80
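
A hypothetical driver for the class above, ensuring the searcher is closed on exit:

if __name__ == '__main__':
    search = LuceneSearch()
    try:
        search.cli_search()
    finally:
        search.close()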
Example #18
    def getCrowds(self, query, field=CrowdFields.text):
        searcher = IndexSearcher(self.index, True)
        q = QueryParser(Version.LUCENE_CURRENT, field,
                        self.analyzer).parse(query)
        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(q, collector)
        hits = collector.topDocs().scoreDocs

        return [
            searcher.doc(scoreDoc.doc).get(CrowdFields.id) for scoreDoc in hits
        ]
Example #19
def pesquisar_com_lucene():
    initVM()
    #print 'lucene', VERSION

    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    for query in querys:
        query_number =  query.query_number
        # Constructs a query parser. We specify what field to search into.
        query.query_text = query.query_text.replace('?','')
        query.query_text = query.query_text.replace('*','')
        queryParser = QueryParser(Version.LUCENE_CURRENT,
                                  FIELD_CONTENTS, analyzer)

        # Create the query
        query = queryParser.parse(query.query_text)

        # Run the query and get the top 50000 results
        topDocs = searcher.search(query, 50000)

        # Get top hits
        scoreDocs = topDocs.scoreDocs

        r = resultado_query(query_number,scoreDocs)
        resultados.append(r)
        #print "%s total matching documents." % len(scoreDocs)
        #for scoreDoc in scoreDocs:
        #    doc = searcher.doc(scoreDoc.doc)
        #    print doc.get(FIELD_PATH)

    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i,int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number,resultados_da_row])
Example #20
def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        ts = analyzer.tokenStream("contents", StringReader(text))  # field name, not field value
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
Example #21
def search_lucene_index(search_params, index_dir, index_metadata,
                        records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index
    """
    results = []

    index_dir = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(index_dir)
    index_fields = index_metadata.keys()

    # Since we are paging results we want to grab what page we are on
    page = (int(search_params.get('page', 1))) - 1

    # Doing something slightly hacky here: we move from 0-based to 1-based
    # indexing to match our pagination display
    offset = int(page) * records_per_page

    # If we are executing an advanced search we will be building a BooleanQuery
    # in parts as opposed to the one MultiFieldQueryParser when doing a basic
    # search
    query = None

    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, index_fields,
                                       analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param((search_params.get('q'))))

    # Parse through our hits
    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    count = min(hits.totalHits - offset, records_per_page)

    for i in xrange(0, count):
        score_doc = hits.scoreDocs[offset + i]
        doc = searcher.doc(score_doc.doc)
        term_id = doc.get('term id')
        name = doc.get('name')
        explain = searcher.explain(query, score_doc.doc)
        match_fields = get_field_matches(explain.toString(), index_fields)

        results.append((term_id, name, list(match_fields)))

    searcher.close()
    return (results, total_hits)
Example #22
def retrieve_document_details(docid, index_dir):
    '''
    Retrieves the single document associated with the docid passed as a
    parameter. The document is looked up in the index directory referred
    to by index_dir.

    To access a specific field's value, call document.get(<field_name>)
    on the returned document, where <field_name> is a string.
    '''
    
    store = SimpleFSDirectory(File(index_dir))
    searcher = IndexSearcher(store, True)
    document = searcher.doc(int(docid))
    return document
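
A hypothetical call, following the docstring (the docid and field name are assumptions):

document = retrieve_document_details('42', '/path/to/lucene/index')
print document.get('title')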
Example #23
    def testQueryParser(self):

        searcher = IndexSearcher(self.directory, True)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            SimpleAnalyzer()).parse("+JUNIT +ANT -MOCK")
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
        d = searcher.doc(scoreDocs[0].doc)
        self.assertEqual("Java Development with Ant", d.get("title"))

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            SimpleAnalyzer()).parse("mock OR junit")
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(2, len(scoreDocs), "JDwA and JIA")
Example #24
    def testSecurityFilter(self):

        query = TermQuery(Term("keywords", "info"))

        searcher = IndexSearcher(self.directory, True)
        topDocs = searcher.search(query, 50)
        self.assertEqual(2, topDocs.totalHits, "Both documents match")

        jakeFilter = QueryWrapperFilter(TermQuery(Term("owner", "jake")))

        scoreDocs = searcher.search(query, jakeFilter, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
        self.assertEqual("jakes sensitive info",
                         searcher.doc(scoreDocs[0].doc).get("keywords"),
                         "elwood is safe")
Example #25
    def search(cls, indexDir, q):

        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(fsDir, True)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
        start = time()
        hits = searcher.search(query, 50).scoreDocs
        duration = timedelta(seconds=time() - start)

        print "Found %d document(s) (in %s) that matched query '%s':" %(len(hits), duration, q)

        for hit in hits:
            doc = searcher.doc(hit.doc)
            print 'path:', doc.get("path")
Example #26
def luceneRetriver(queryString):
	lucene.initVM()
	# location of the index
	indexDir = r"C:\index"  # raw string, so the backslash is not an escape
	dir = SimpleFSDirectory(File(indexDir))
	analyzer = StandardAnalyzer(Version.LUCENE_30)
	searcher = IndexSearcher(dir)
	query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(queryString)
	# maximum number of records to return
	MAX = 1000
	hits = searcher.search(query, MAX)
	print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
	for hit in hits.scoreDocs:
		print hit.score, hit.doc, hit.toString()
		doc = searcher.doc(hit.doc)
		print doc.get("path").encode("utf-8")
Example #27
def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)
Example #28
def boolean_search_lucene_index(index_dir, query_text, limit):
    '''
    This function searches a boolean query in the learned lucene index

    Arguments:
        index_dir - the lucene index directory
        query_text - the query text, which follows
                     http://lucene.apache.org/core/3_6_0/queryparsersyntax.html
        limit - the number of records to be retrieved
    Return:
        rows - the returned document details
    '''
    DEFAULT_QUERY_FIELD = 'all'

    store = SimpleFSDirectory(File(index_dir))
    
    searcher = IndexSearcher(store, True)
    parser = QueryParser(Version.LUCENE_CURRENT, DEFAULT_QUERY_FIELD, STD_ANALYZER)
    query = parser.parse(query_text)
    
    start = datetime.datetime.now()
    scoreDocs = searcher.search(query, limit).scoreDocs
    duration = datetime.datetime.now() - start
    
    # print "Lucene Search: Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

    
    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field,'empty') != 'empty' :
                row.append(table.get(field,'empty'))
            else: 
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID,'empty'))) # the unique file id of a file 
        row.append(scoreDoc.score)
        
        rows.append(row)
    
    return rows
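
A hypothetical call using the boolean syntax the docstring links to (the path and field names are assumptions):

rows = boolean_search_lucene_index('/path/to/lucene/index',
                                   'email_subject:Training AND all:lucene',
                                   50)
for row in rows:
    print row[-1]  # the hit score is appended as the last element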
Example #29
def search_lucene_index(search_params, index_dir, index_metadata, records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index
    """
    results = []

    index_dir = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(index_dir)
    index_fields = index_metadata.keys()

    # Since we are paging results we want to grab what page we are on
    page = (int(search_params.get('page', 1))) - 1

    # Doing something slightly hacky here: we move from 0-based to 1-based
    # indexing to match our pagination display
    offset = int(page) * records_per_page

    # If we are executing an advanced search we will be building a BooleanQuery
    # in parts as opposed to the one MultiFieldQueryParser when doing a basic
    # search
    query = None
    
    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params, search_params.get('operator'), analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, index_fields, analyzer)
        query = MultiFieldQueryParser.parse(parser, process_query_param((search_params.get('q'))))

    # Parse through our hits
    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    count = min(hits.totalHits - offset, records_per_page)

    for i in xrange(0, count):
        score_doc = hits.scoreDocs[offset+i]
        doc = searcher.doc(score_doc.doc)
        term_id = doc.get('term id')
        name = doc.get('name')
        explain = searcher.explain(query, score_doc.doc)
        match_fields = get_field_matches( explain.toString(), index_fields )

        results.append( (term_id, name, list(match_fields)) )

    searcher.close()
    return (results, total_hits)
Example #30
    def search(cls, indexDir, q):

        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(fsDir, True)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
        start = time()
        hits = searcher.search(query, 50).scoreDocs
        duration = timedelta(seconds=time() - start)

        print "Found %d document(s) (in %s) that matched query '%s':" % (
            len(hits), duration, q)

        for hit in hits:
            doc = searcher.doc(hit.doc)
            print 'path:', doc.get("path")
Example #31
    def delete_old(self, index):
        existing_ids = set([book.id for book in Book.objects.all()])

        reader = IndexReader.open(index.index, False)
        searcher = IndexSearcher(reader)
        try:
            num = searcher.docFreq(Term('is_book', 'true'))
            docs = searcher.search(Search.make_term_query(['true'], 'is_book'), num)
            for result in docs.scoreDocs:
                stored = searcher.doc(result.doc)
                book_id = int(stored.get('book_id'))
                if book_id not in existing_ids:
                    print "book id %d doesn't exist." % book_id
                    index.remove_book(book_id)
        finally:
            searcher.close()
            reader.close()
Example #32
class WordNetSynonymEngine(object):
    def __init__(self, indexDir):

        self.directory = RAMDirectory(SimpleFSDirectory(File(indexDir)))
        self.searcher = IndexSearcher(self.directory)

    def getSynonyms(self, word):

        synList = []
        topDocs = self.searcher.search(TermQuery(Term("word", word)), 50)

        for scoreDoc in topDocs.scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            for value in doc.getValues("syn"):
                synList.append(value)

        return synList
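
A hypothetical use of the engine, assuming a WordNet index with 'word' and 'syn' fields as read above (the index path is an assumption):

engine = WordNetSynonymEngine('wordnet-index')
print engine.getSynonyms('fast')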
Example #33
    def testFuzzy(self):

        self.indexSingleFieldDocs([Field("contents", "fuzzy", Field.Store.YES,
                                         Field.Index.ANALYZED),
                                   Field("contents", "wuzzy", Field.Store.YES,
                                         Field.Index.ANALYZED)])

        searcher = IndexSearcher(self.directory)
        query = FuzzyQuery(Term("contents", "wuzza"))
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(2, len(scoreDocs), "both close enough")

        self.assert_(scoreDocs[0].score != scoreDocs[1].score,
                     "wuzzy closer than fuzzy")
        self.assertEqual("wuzzy",
                         searcher.doc(scoreDocs[0].doc).get("contents"),
                         "wuzza bear")
Example #34
    def testHits(self):

        searcher = IndexSearcher(self.directory, True)
        query = TermQuery(Term("title", "action"))
        scoreDocs = searcher.search(query, 50).scoreDocs

        scorer = QueryScorer(query)
        highlighter = Highlighter(scorer)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            title = doc["title"]
            stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
                "title", StringReader(title))
            fragment = highlighter.getBestFragment(stream, title)

            print fragment
Example #35
	def doSearch(self,searchString,fieldToSearch,luceneDir):
		searchResult =[]
		store = SimpleFSDirectory(File(luceneDir))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		searcher = IndexSearcher(store)
		query = QueryParser(Version.LUCENE_CURRENT,fieldToSearch,analyzer).parse(searchString)
		hits = searcher.search(query,self.MAX)
		
		print "Found %d documents that matched the query '%s'" %(hits.totalHits,searchString)
		for hit in hits.scoreDocs:
			doc = searcher.doc(hit.doc)
			#docdict['score'] = hit.score
			#docdict['docid'] = hit.doc
			#docdict['content'] = doc.get("contents").encode("utf-8")
			searchResult.append([doc.get("title").encode("utf-8"),doc.get("contents").encode("utf-8")])
		searcher.close()	
		return searchResult
Example #36
def run(writer, analyzer):
	while True:
		print 
		print "Hit enter with no input to quit."
		command = raw_input("Query:")
		if command == '':
			return

		print "Searching for:", command
		IndexReader = writer.getReader()
		searcher = IndexSearcher(IndexReader)
		query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command)
		scoreDocs = searcher.search(query, 50).scoreDocs
		print "%s total matching documents." % len(scoreDocs)

		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			print 'path:', doc.get("path"), 'name:', doc.get("name")
Example #37
class WordNetSynonymEngine(object):

    def __init__(self, indexDir):

        self.directory = RAMDirectory(SimpleFSDirectory(File(indexDir)))
        self.searcher = IndexSearcher(self.directory)

    def getSynonyms(self, word):

        synList = []
        topDocs = self.searcher.search(TermQuery(Term("word", word)), 50)
        
        for scoreDoc in topDocs.scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            for value in doc.getValues("syn"):
                synList.append(value)

        return synList
Example #38
def retrieve(string, tweetID):
    global eventNum
    global eventDict
    global eventList
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        searcher = IndexSearcher(dir)
    except lucene.JavaError:
        # no index exists yet: this tweet starts a new event
        eventDict[tweetID] = eventNum
        eventNum = eventNum + 1
        return
    try:
        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(string)
        MAX = 2
        hits = searcher.search(query, MAX)
        #print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

        if hits.totalHits > 0:
            # join the event of the best-matching indexed tweet
            print tweetID, hits.scoreDocs[0].doc
            eventDict[tweetID] = eventDict[hits.scoreDocs[0].doc]
        elif hits.totalHits == 0:
            eventDict[tweetID] = eventNum
            eventNum = eventNum + 1

        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            #print doc.get("text").encode("utf-8")
    except lucene.JavaError:
        pass
Example #39
def search_lucene_index(index_dir, query_model, limit):
    '''
    This function searches a query model (query terms along with their
    metadata) in the learned lucene index

    Arguments:
        index_dir - the lucene index directory
        query_model - the query model (contains query terms, metadata, and conjunctions)
        limit - the number of records to be retrieved
    Return:
        rows - the returned document details
    '''
    store = SimpleFSDirectory(File(index_dir))
    searcher = IndexSearcher(store, True)
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, query_model[1], STD_ANALYZER)
    query = parser.parse(Version.LUCENE_CURRENT, query_model[0], query_model[1], query_model[2], STD_ANALYZER)
    scoreDocs = searcher.search(query, limit).scoreDocs
    
    print "Found %d document(s) that matched query '%s':" %(len(scoreDocs), query)
    
    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field,'empty') != 'empty' :
                row.append(table.get(field,'empty'))
            else: 
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID,'empty'))) # the unique file id of a file 
        row.append(scoreDoc.score)
        
        rows.append(row)
    
    return rows
Example #40
class OccuredCandidates:
    indexDir = 'data/index'
    max_candidates = 30

    def __init__(self):
        lucene.initVM()
        self._lversion = Version.LUCENE_30
        self._analyzer = EnglishAnalyzer(self._lversion)
        self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))

        self._translation = loadTranslation()
        self._links = loadLinks()

    def find(self, phrase):
        phrase = phrase.lower().encode('utf8')
        query = ' '.join(['+' + word for word in phrase.split(' ')])
        query = QueryParser(self._lversion, 'contents',
                            self._analyzer).parse(query)
        hits = self._searcher.search(query, self.max_candidates)

        # if not hits.totalHits: print "%d documents for '%s'" % (hits.totalHits, str(query)) # potential bug

        # todo put article_id in lucene index instead of translating document title

        links = {}
        for hit in hits.scoreDocs:
            title = quote(
                self._searcher.doc(
                    hit.doc).get("title").encode('utf-8').replace(
                        ' ', '_')).replace('%28', '(').replace('%29', ')')
            if title in self._translation:
                links[self._translation[title]] = hit.score
            # else: print title # potential bug

        return self._links[phrase].get(-1, 0), links

    def clear_links(self, annotations):
        return filter(
            lambda annotation: annotation['links'] and max(annotation[
                'links'].values()) > 1, annotations)
Example #41
def luceneRetriver(query):
    #print ('-------------Searching-------------')
    #print (query)
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, 'text',
                           lucene_analyzer).parse(query)
    MAX = 1000

    # list that collects the titles of the returned documents
    title_list = []

    total_hits = lucene_searcher.search(my_query, MAX)

    #print "Hits: ", total_hits.totalHits

    for hit in total_hits.scoreDocs[:10]:
        #print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        #print doc.get("title").encode("utf-8").lstrip(str(TXTDIR))
        #print doc.get("text").encode("utf-8")

        # note: lstrip() strips a set of characters, not a path prefix
        title_list.append({
            doc.get("title").encode("utf-8").lstrip(str(TXTDIR)):
            round(hit.score, 5)
        })

    return title_list


#print ('Query: 八卦')
#print ('Results:')
#luceneRetriver("下列 关于 中国 八卦 不正确 人类 历史 东西方 平等 交流 见证")
Example #42
def test_search(index_dir):
    '''
    A test function to exercise the created index
    '''
    store = SimpleFSDirectory(File(index_dir))
   
    searcher = IndexSearcher(store, True)
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", STD_ANALYZER)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    query = parser.parse('email_subject:Training')
    start = datetime.datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.datetime.now() - start
    
    print "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)
    
    
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print scoreDoc.score
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        print table
Example #43
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: Explainer <index dir> <query>"

        else:
            indexDir = argv[1]
            queryExpression = argv[2]

            directory = SimpleFSDirectory(File(indexDir))
            query = QueryParser(Version.LUCENE_CURRENT, "contents",
                                SimpleAnalyzer()).parse(queryExpression)

            print "Query:", queryExpression

            searcher = IndexSearcher(directory)
            scoreDocs = searcher.search(query, 50).scoreDocs

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                explanation = searcher.explain(query, scoreDoc.doc)
                print "----------"
                print doc["title"].encode('utf-8')
                print explanation
Example #44
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: Explainer <index dir> <query>"

        else:
            indexDir = argv[1]
            queryExpression = argv[2]

            directory = SimpleFSDirectory(File(indexDir))
            query = QueryParser(Version.LUCENE_CURRENT, "contents",
                                SimpleAnalyzer()).parse(queryExpression)

            print "Query:", queryExpression

            searcher = IndexSearcher(directory)
            scoreDocs = searcher.search(query, 50).scoreDocs

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                explanation = searcher.explain(query, scoreDoc.doc)
                print "----------"
                print doc["title"].encode('utf-8')
                print explanation
Example #45
        writer.addDocument(doc)

    print("Indexed lines from stdin (%d documents in index)" %
          (writer.numDocs()))
    print("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()
    print("...done optimizing index of %d documents" % writer.numDocs())
    print("Closing index of %d documents..." % writer.numDocs())
    print("...done closing index of %d documents" % writer.numDocs())
    writer.close()

    # RETRIEVAL

    dir = SimpleFSDirectory(File(fullIndexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                        analyzer).parse(u"¿Dónde está La Mancha?")
    MAX = 1000
    hits = searcher.search(query, MAX)

    print("Found %d document(s) that matched query '%s':" %
          (hits.totalHits, query))

    for hit in hits.scoreDocs:
        print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        print(doc.get("text").encode("utf-8"))
        print(doc.get("metadata").encode("utf-8"))
Example #46
    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Constructs a query parser.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    # Create a query
    query = queryParser.parse(QUERY_STRING)

    topDocs = searcher.search(query, 50)

    # Get top hits
    scoreDocs = topDocs.scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    HighlightFormatter = SimpleHTMLFormatter()
    query_score = QueryScorer(query)

    highlighter = Highlighter(HighlightFormatter, query_score)

    # Set the fragment size; we break text into fragments of 64 characters
    fragmenter = SimpleSpanFragmenter(query_score, 64)
    highlighter.setTextFragmenter(fragmenter)

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        text = doc.get(FIELD_CONTENTS)
        ts = analyzer.tokenStream(FIELD_CONTENTS, StringReader(text))
        print doc.get(FIELD_PATH)
        print highlighter.getBestFragments(ts, text, 3, "...")
    print ""
Example #47
def main(indexDir, inputDir):
    """Creates a SQLite database with news linked to other news by at least one term, backed by a Lucene Index"""
    lucene.initVM()

    # Open index
    logger.info("Opening Lucene index [%s]..." % indexDir)
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(dir)

    # Search documents
    onlyfiles = [
        f for f in listdir(inputDir)
        if isfile(join(inputDir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        json_data = open(inputDir + '/' + f)
        data = json.load(json_data)
        # The results collected after comparison
        results = list()

        journal_code = f.split('.')[0]

        for entry in data:
            url = entry['url']
            date = entry['date']
            title = entry['title']

            logger.debug("Processing URL [%s] date [%s] - [%s]" %
                         (url, date, title))

            tt = nltk.word_tokenize(title)
            tokens = []
            for t in tt:
                tokens.append(t.lower())

            entry['similars'] = list()

            for token in tokens:
                q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (
                    token, date, journal_code, url)
                query = QueryParser(Version.LUCENE_CURRENT, "title",
                                    analyzer).parse(q)
                hits = searcher.search(query, MAX_HITS)

                logger.debug("Found %d document(s) that matched query '%s':" %
                             (hits.totalHits, q))

                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    logger.debug(doc)
                    entry['similars'].append({
                        'token': token,
                        'url': doc.get('url'),
                        'title': doc.get('title')
                    })

            results.append(entry)
        json_data.close()

        print """<html>
	<body>
	<table><thead>
	<tr>
	<th>Jornal</th><th>Data</th><th>T&iacute;tulo</th><th>URL</th><th>Not&iacute;cias semelhantes</th>
	</tr>
	</thead>
	<tbody>
	"""
        for entry in results:
            similars = entry['similars']
            similars_text = '<ul>'
            for s in similars:
                similars_text += '<li>[%s] [%s] [%s]</li>' % (
                    s['token'].encode('iso-8859-1', 'ignore'),
                    s['title'].encode('iso-8859-1', 'ignore'), s['url'].encode(
                        'iso-8859-1', 'ignore'))
            similars_text += '</ul>'
            print """<tr>
	<td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>
	</tr>
	""" % (journal_code, entry['date'].encode('iso-8859-1', 'ignore'),
            entry['title'].encode('iso-8859-1', 'ignore'), entry['url'].encode(
            'iso-8859-1', 'ignore'), similars_text)
Example #48
class DistanceSortingTest(TestCase):
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        self.addPoint(writer, "El Charro", "restaurant", 1, 2)
        self.addPoint(writer, "Cafe Poca Cosa", "restaurant", 5, 9)
        self.addPoint(writer, "Los Betos", "restaurant", 9, 6)
        self.addPoint(writer, "Nico's Taco Shop", "restaurant", 3, 8)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.query = TermQuery(Term("type", "restaurant"))

    def addPoint(self, writer, name, type, x, y):

        doc = Document()
        doc.add(Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(
            Field("x", str(x), Field.Store.YES,
                  Field.Index.NOT_ANALYZED_NO_NORMS))
        doc.add(
            Field("y", str(y), Field.Store.YES,
                  Field.Index.NOT_ANALYZED_NO_NORMS))

        writer.addDocument(doc)

    def testNearestRestaurantToHome(self):

        sort = Sort(SortField("location", DistanceComparatorSource(0, 0)))

        scoreDocs = self.searcher.search(self.query, None, 50, sort).scoreDocs
        self.assertEqual("El Charro",
                         self.searcher.doc(scoreDocs[0].doc).get("name"),
                         "closest")
        self.assertEqual("Los Betos",
                         self.searcher.doc(scoreDocs[3].doc).get("name"),
                         "furthest")

    def testNeareastRestaurantToWork(self):

        sort = Sort(SortField("location", DistanceComparatorSource(10, 10)))

        docs = self.searcher.search(self.query, None, 3, sort)
        self.assertEqual(4, docs.totalHits)
        self.assertEqual(3, len(docs.scoreDocs))

        fieldDoc = FieldDoc.cast_(docs.scoreDocs[0])
        distance = Double.cast_(fieldDoc.fields[0]).doubleValue()

        self.assertEqual(sqrt(17), distance, "(10,10) -> (9,6) = sqrt(17)")

        document = self.searcher.doc(fieldDoc.doc)
        self.assertEqual("Los Betos", document["name"])

        self.dumpDocs(sort, docs)

    def dumpDocs(self, sort, docs):

        print "Sorted by:", sort

        for scoreDoc in docs.scoreDocs:
            fieldDoc = FieldDoc.cast_(scoreDoc)
            distance = Double.cast_(fieldDoc.fields[0]).doubleValue()
            doc = self.searcher.doc(fieldDoc.doc)
            print "  %(name)s @ (%(location)s) ->" % doc, distance
Example #49
class BooksLikeThis(object):
    def main(cls, argv):

        indexDir = System.getProperty("index.dir")
        directory = SimpleFSDirectory(File(indexDir))

        reader = IndexReader.open(directory, True)
        blt = BooksLikeThis(reader)

        for id in xrange(reader.maxDoc()):
            if reader.isDeleted(id):
                continue
            doc = reader.document(id)
            print ''
            print doc.get("title").encode('utf-8')

            docs = blt.docsLike(id, doc, 10)
            if not docs:
                print "  None like this"
            else:
                for doc in docs:
                    print " ->", doc.get("title").encode('utf-8')

    def __init__(self, reader):

        self.reader = reader
        self.searcher = IndexSearcher(reader)

    def docsLike(self, id, doc, max):

        authors = doc.getValues("author")
        authorQuery = BooleanQuery()
        for author in authors:
            authorQuery.add(TermQuery(Term("author", author)),
                            BooleanClause.Occur.SHOULD)
        authorQuery.setBoost(2.0)

        vector = self.reader.getTermFreqVector(id, "subject")

        subjectQuery = BooleanQuery()
        for term in vector.getTerms():
            tq = TermQuery(Term("subject", term))
            subjectQuery.add(tq, BooleanClause.Occur.SHOULD)

        likeThisQuery = BooleanQuery()
        likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD)
        likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD)

        # exclude myself
        likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))),
                          BooleanClause.Occur.MUST_NOT)

        print "  Query:", likeThisQuery.toString("contents")
        scoreDocs = self.searcher.search(likeThisQuery, 50).scoreDocs

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            if len(docs) < max:
                docs.append(doc)
            else:
                break

        return docs

    main = classmethod(main)
Example #50
    def post(self):
        q = self.get_argument("query")
        k = self.get_argument("kTerms")

        # self.write(key)

        # def query(query):
        # query = self.get_argument("q")
        lucene.initVM()
        indexDir = "index"
        dir = SimpleFSDirectory(File(indexDir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        searcher = IndexSearcher(dir)

        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
        MAX = 10
        hits = searcher.search(query, MAX)

        print "Found %d document(s) that matched query '%s':" % (
            hits.totalHits, query)
        items = []
        rQ = []

        #for key, value in doc_urls.iteritems()
        # print (key, value)

        for hit in hits.scoreDocs:
            #items.append({'score':hit.score, 'doc':hit.doc, 'blah':hit.toString(), 'url':doc_urls[str(hit.doc)]})
            print hit.score, hit.doc, hit.toString()
            print(len(doc_urls))
            items.append(doc_urls[str(hit.doc)])
            print(doc_urls[str(hit.doc)])
            doc = searcher.doc(hit.doc)
            print(hit.doc)
            rQ.append("html_files/" + str(hit.doc))

        i = 0
        rqSize = 0
        for url in rQ:
            rqSize = rqSize + 1
            print(url)
            f = codecs.open(url, 'r')
            html = f.read()
            html = html.decode('utf-8')
            tag_free = strip_tags(html)
            path = 'strippedHTML_files'
            if not os.path.exists(path):
                os.makedirs(path)
            filename = str(i)
            with open(os.path.join(path, filename), 'wb') as temp_file:
                temp_file.write(tag_free.encode('utf-8'))
            i = i + 1

        path = 'strippedHTML_files'
        i = 0
        for filename in os.listdir(path):
            with open(os.path.join(path, filename), 'r') as myfile:
                data = myfile.read()
                stripStopWords(data, i)
                i = i + 1
        if int(k) > 0:  # kTerms arrives as a string from get_argument
            newQuery = calcNewQuery(k, q, rqSize)
            q = newQuery
            print("new query is ")
            print(q)

        self.render("index.html",
                    title="Results",
                    items=items,
                    query=q,
                    kTerms=k)