# imports assumed for PyLucene 4.10.x plus NLTK for tokenizing/stemming
import lucene
from java.io import File
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexReader
from org.apache.lucene.queryparser.classic import MultiFieldQueryParser, QueryParserBase
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version


def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)

    fields = ["title", "abstract", "authors"]

    st = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    for qid, q in queries.iteritems():
        # stem each query term before handing the query to the parser
        qwords = tokenizer.tokenize(q)
        query = " ".join(st.stem(w) for w in qwords)
        parser = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            # TREC-style run line: qid, iteration, docno, rank, score, run tag
            f.write("%s Q0 %s %s %s G17R3\n" % (qid, hit.doc + 1, i + 1, hit.score))
            # print hit.doc+1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")
    f.close()
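A note on the two calling conventions that recur throughout these examples: in PyLucene, the static parse overloads shadow the inherited parse(String), so the single-string form is invoked through the class with the parser instance as the first argument. A minimal sketch, assuming an `analyzer` and the Lucene 4.10 setup above:

from org.apache.lucene.queryparser.classic import MultiFieldQueryParser
from org.apache.lucene.search import BooleanClause
from org.apache.lucene.util import Version

fields = ["title", "abstract"]

# Idiom 1: configure an instance, then dispatch the inherited
# parse(String) through the class, passing the instance explicitly.
parser = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
query = MultiFieldQueryParser.parse(parser, "shigella sonnei")

# Idiom 2: the static overload takes parallel lists of query strings,
# field names, and occurrence flags.
flags = [BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD]
query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1,
                                    ["shigella", "sonnei"],
                                    fields, flags, analyzer)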
Example #2
    def _create_query(self, fields):
        """
        Build query with Term, Phrase and Fuzzy clauses.
        :param fields: dictionary of (field, text) tuples
        :return: query
        """
        query = BooleanQuery()
        for (field, text) in fields:
            if field.startswith("year"):
                start, end = text.split(",")
                numeric_query = NumericRangeQuery.newIntRange(
                    'year', int(start), int(end), True, True)
                query.add(BooleanClause(numeric_query, BooleanClause.Occur.MUST))
            if field == 'title':
                spans = []
                for word in text.lower().split():
                    spans.append(SpanTermQuery(Term(field, word)))
                query.add(BooleanClause(SpanNearQuery(spans, 2, True), BooleanClause.Occur.SHOULD))

        field_names, field_texts = zip(*fields)
        flags = [BooleanClause.Occur.MUST] * len(field_names)

        query_parser_query = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT,
            field_texts,
            field_names,
            flags,
            StandardAnalyzer(Version.LUCENE_CURRENT))
        query.add(BooleanClause(query_parser_query, BooleanClause.Occur.MUST))

        # suffix each term with ~1 so the parser emits edit-distance-1 fuzzy clauses
        fuzzify = lambda s: (s + " ").replace(" ", "~1 ")
        fuzzy_field_texts = map(fuzzify, field_texts)

        fuzzy_query_parser_query = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT,
            fuzzy_field_texts,
            field_names,
            flags,
            StandardAnalyzer(Version.LUCENE_CURRENT))
        query.add(BooleanClause(fuzzy_query_parser_query, BooleanClause.Occur.MUST))

        boostQuery = FunctionQuery(
            LinearFloatFunction(
                PowFloatFunction(
                    DoubleConstValueSource(0.0001),
                    ScaleFloatFunction(IntFieldSource("imdb_votes_boost"), 0.0, 1.0)
                ), -1.0, 1.0))
        query = CustomScoreQuery(query, boostQuery)

        return query
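For reference, the fuzzify helper above appends "~1" to every whitespace-separated term, which the query parser then treats as an edit-distance-1 fuzzy clause. A quick plain-Python illustration:

fuzzify = lambda s: (s + " ").replace(" ", "~1 ")
print(fuzzify("godfather pacino"))  # -> "godfather~1 pacino~1 "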
Example #3
	def search(self, input_query=None, max_answers=10):
		''' Searches the given query in the index '''
		if input_query is None:
			return None

		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		
		
		# query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(input_query)
		parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, (self._posts_fields + self._answer_fields), analyzer)
		query = MultiFieldQueryParser.parse(parser, input_query)

		scoreDocs = searcher.search(query, max_answers).scoreDocs
		print "%s total matching documents." % len(scoreDocs)

		docs = []
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			doc_dict = dict((field.name(), field.stringValue()) for field in doc.getFields())
			docs.append(doc_dict)
			# print doc
		return docs
Example #4
    def findLiteral(self, instanceUri, propertyURI):
        labels = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
            flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
            # escape each URI first, then wrap it in quotes for a phrase query
            # (the original quoted first, so escape() also escaped the quotes)
            labelOrTitleUris = "\"" + QueryParser.escape(propertyURI) + "\""
            queries = ["\"" + QueryParser.escape(instanceUri) + "\"", labelOrTitleUris]
            query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                # re-run the search asking for all matching documents
                result = self._searcher.search(query, freq)
            for hit in pyJava.JArray2List(result.scoreDocs):
                doc = self._searcher.doc(hit.doc)
                labels.append(doc.get(FreyaConstants.FIELD_EXACT_CONTENT))
        except Exception as e:  # was CorruptIndexException in the Java original
            logging.error("Error while searching: %s", e)
        return labels
Example #5
    def searchForClass(self, inst, pred):
        classUris = list()
        fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        queries = ["\"" + QueryParser.escape(inst) + "\"", "\"" + QueryParser.escape(pred) + "\""]
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
            result = self._searcher.search(query, 1)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                # re-run the search asking for all matching documents
                result = self._searcher.search(query, freq)
            for hit in pyJava.JArray2List(result.scoreDocs):
                doc = self._searcher.doc(hit.doc)
                classUris.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
        except Exception as e:  # was ParseException in the Java original
            logging.error("Error while searching: %s", e)
        return classUris
Example #6
    def findSubClasses(self, classUri):  # RESOLVE multifieldqueryparser DOCUMENTATION PROBLEM!!!!
        propertyURI = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        subClasses = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            fields = [FreyaConstants.CLASS_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
            flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
            subClassUri = "\"" + QueryParser.escape(propertyURI) + "\""
            queries = ["\"" + QueryParser.escape(classUri) + "\"", subClassUri]
            query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                # re-run the search asking for all matching documents
                result = self._searcher.search(query, freq)
            for hit in pyJava.JArray2List(result.scoreDocs):
                doc = self._searcher.doc(hit.doc)
                subClasses.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
        except Exception as e:  # was CorruptIndexException in the Java original
            logging.error("Error while searching: %s", e)
        return subClasses
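The three FREyA lookups above (findLiteral, searchForClass, findSubClasses) differ only in which fields are pinned and which field is read back from the hits. A hedged sketch of a helper method they could share, under the same assumptions (self._searcher, FreyaConstants, pyJava, and the Lucene 4.x imports):

    def _searchPair(self, queries, fields, outField):
        """Sketch: both fields MUST match; collect outField from every hit."""
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries,
                                            fields, flags, analyzer)
        result = self._searcher.search(query, 1)
        if result.totalHits > 0:
            result = self._searcher.search(query, result.totalHits)
        return [self._searcher.doc(hit.doc).get(outField)
                for hit in pyJava.JArray2List(result.scoreDocs)]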
Example #7
File: retriever.py Project: kevkid/YIF
def SearchQuery(queryString, fields, classification): 
    #if __name__ == "__main__":
    #if __name__ == "retriever":
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    parser = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, queryString)  # e.g. "Shigella sonnei"
    #query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)

    MAX = 10000
    hits = searcher.search(query, MAX)
 
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title" : doc.get("title")}#we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict 
    
    #Where we get the images for all the pmcids    
    images = get_image_pmcid(pmcids, classification)#should take in pmcids and class
    #create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        path = img.get("filepath") + "/" + img.get("figureid")
        imagesDict.setdefault(img_pmcid, []).append(path)
            
    #for each pmcid, we will assign an image to it for the search results
    #(check per-pmcid membership to avoid a KeyError for articles without images)
    for pmcid in pmcids:
        docDict = documentDict[pmcid]
        if pmcid in imagesDict:
            docDict["imgURL"] = imagesDict[pmcid][0]
        else:
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
        documentDict[pmcid] = docDict
    
    #END - Where we get the images for all the pmcids
    
    
    return documentDict
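A hedged usage sketch of SearchQuery (the field list and the "all" classification value are assumptions based on the rest of this page):

results = SearchQuery("Shigella sonnei", ["title", "abstract"], "all")
for pmcid, info in results.items():
    print pmcid, info["title"], info["imgURL"]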
Example #8
def func_cross(former, mid, last):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = former + ' ' + mid + ' ' + last
    fields = ["former", "mid", "last"]
    # NOTE: these clauses are never used; the parser below applies its
    # default operator (AND) between terms instead.
    clauses = [
        BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD,
        BooleanClause.Occur.SHOULD
    ]
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, query)

    scoreDocs = searcher.search(query, 200).scoreDocs
    results = process(scoreDocs, searcher)
    return results
Example #9
def search_lucene(fields_,terms_,requirements_,searcher,index=0):
  terms = []
  fields = []
  requirements = []
  for (i,x) in enumerate(terms_):
    terms.append(x[index])
    fields.append(fields_[i][index])
    requirements.append(requirements_[i][index])
  sys.stdout.write("Running query %s: (\"%s\") in fields (%s) with requirements (%s)\n" %
                   (sym2name[index], "\",\"".join(terms), ",".join(fields),
                    ",".join([sym2name[str(x)] for x in requirements])))
  query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1, terms, fields, requirements, analyzer2)
  return (terms, fields, requirements, searcher.search(query, NUM_TO_RETRIEVE))
Example #10
    def __init__(self,
                 index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0,
                            title=8.0,
                            meta=2.0,
                            content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Parameters
        ----------
        index_dir : string
            The location of lucene index.
        search_fields : list
            A list of field names indicating fields to search on.
        unique_field : string
            The field name on which duplication should be avoided.
        boost : dict
            This dict controls the field weights used when computing scores.
        date_format : string
            Format for converting strings into datetimes. Should be
            consistent with the indexing side.
        """
        self.index_dir = index_dir
        self.search_fields = search_fields
        self.sort_by_recent = Sort(
            SortField('date_published', SortField.Type.STRING, True))
        self.store = FSDirectory.open(File(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.dup_filter = DuplicateFilter(unique_field)
        self.boost_map = HashMap()
        for k, v in boost.iteritems():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                                self.boost_map)
        self.date_format = date_format
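The HashMap/Float conversion above is how a Python dict of boosts reaches the MultiFieldQueryParser(fields, analyzer, boosts) constructor through the Java bridge. A minimal sketch, assuming an existing analyzer:

from java.lang import Float
from java.util import HashMap

boost_map = HashMap()
for k, v in dict(title=8.0, content=1.0).items():
    boost_map.put(k, Float(v))
parser = MultiFieldQueryParser(['title', 'content'], analyzer, boost_map)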
Example #11
    def doc_search(self, field, keywords, numHits):
        if field != 'All':
            analyzer = StandardAnalyzer()
            parser = QueryParser(field, analyzer)
            query = parser.parse(keywords)

            # self.lReader.getDocCount("title");

            hits = []
            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs
            except RuntimeError:
                print "Scoring documents failed"
            self.hits = hits
            self.field = field
            return hits
        else:
            analyzer = WhitespaceAnalyzer()
            parser = MultiFieldQueryParser(['Title', 'Body'], analyzer)
            query = MultiFieldQueryParser.parse(parser, keywords)

            # self.lReader.getDocCount("title");

            hits = []
            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs
            except RuntimeError:
                print "Scoring documents failed"
            self.hits = hits
            self.field = field
            return hits
Example #12
def search_samples_lucene(sample_map,sampleq,sample_set,ra,stream_sample_metadata=False):
    (fields,queries,booleans) = lucene_sample_query_parse(sampleq)
    query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1, queries, fields, booleans, snapconf.LUCENE_ANALYZER)
    hits = searcher.search(query, snapconf.LUCENE_MAX_SAMPLE_HITS)
    #if we get nothing, try with the backup analyzer
    if hits.totalHits == 0:
        query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1, queries, fields, booleans, snapconf.LUCENE_BACKUP_ANALYZER)
        hits = searcher.search(query, snapconf.LUCENE_MAX_SAMPLE_HITS)
    if DEBUG_MODE: 
        sys.stderr.write("Found %d document(s) that matched query '%s':\n" % (hits.totalHits, sampleq))
    if stream_sample_metadata:
        sys.stdout.write("DataSource:Type\tLucene TF-IDF Score\t%s\n" % (snapconf.SAMPLE_HEADER))
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        sid = doc.get(snapconf.SAMPLE_ID_FIELD_NAME)
        #track the sample ids if asked to
        if sid is not None and len(sid) >= 1:
            if sample_set is not None:
                sample_set.add(sid)
            #stream back the full sample metadata record from the in-memory dictionary
            if stream_sample_metadata:
                sys.stdout.write("%s:S\t%s\t%s\n" % (snapconf.DATA_SOURCE,str(hit.score),sample_map[sid]))
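lucene_sample_query_parse is project-specific, but the call site fixes its contract: three parallel, equal-length lists feeding the static parse overload. Illustrative, hypothetical values:

fields = ['description_t', 'tissue_t']
queries = ['brain', 'adult']
booleans = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]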
Example #13
    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query, ['docno', 'content'], [SHOULD, SHOULD], self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        for j, score_doc in enumerate(topDocs.scoreDocs):
            d = searcher.doc(score_doc.doc)
            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(score_doc.score)
Example #14
    def test_searchDocumentsWithMultiField(self):
        """
        Tests searching with MultiFieldQueryParser
        """

        self.test_indexDocument()
        store = self.openStore()
        searcher = None
        try:
            searcher = self.getSearcher(store)
            SHOULD = BooleanClause.Occur.SHOULD
            query = MultiFieldQueryParser.parse("value", ["title", "docid"],
                                                [SHOULD, SHOULD],
                                                self.getAnalyzer())
            topDocs = searcher.search(query, 50)
            self.assertEqual(1, topDocs.totalHits)
        finally:
            self.closeStore(store)
Example #15
def search_samples_lucene(sample_map,sampleq,sample_set,stream_sample_metadata=False):
    (fields,queries,booleans) = lucene_sample_query_parse(sampleq)
    query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1, queries, fields, booleans, analyzer)
    #query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1, ['human AND adult AND brain'], ['description_t'], [BooleanClause.Occur.MUST], analyzer)
    hits = searcher.search(query, snapconf.LUCENE_MAX_SAMPLE_HITS)
    if DEBUG_MODE: 
        sys.stderr.write("Found %d document(s) that matched query '%s':\n" % (hits.totalHits, sampleq))
    if stream_sample_metadata:
        sys.stdout.write("DataSource:Type\t%s\n" % (snapconf.SAMPLE_HEADER))
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        sid = doc.get("intropolis_sample_id_i")
        #track the sample ids if asked to
        if sample_set is not None:
            sample_set.add(sid)
        #stream back the full sample metadata record from the in-memory dictionary
        if stream_sample_metadata:
            sys.stdout.write("%s:S\t%s\n" % (snapconf.DATA_SOURCE,sample_map[sid]))
Example #16
    def test_searchDocumentsWithMultiField(self):
        """
        Tests searching with MultiFieldQueryParser
        """

        self.test_indexDocument()
        store = self.openStore()
        searcher = None
        try:
            searcher = self.getSearcher(store)
            SHOULD = BooleanClause.Occur.SHOULD
            query = MultiFieldQueryParser.parse("value", ["title", "docid"],
                                                [SHOULD, SHOULD],
                                                self.getAnalyzer())
            topDocs = searcher.search(query, 50)
            self.assertEqual(1, topDocs.totalHits)
        finally:
            self.closeStore(store)
Example #17
    def search(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        q = MultiFieldQueryParser.parse(query, FIELDS, [SHOULD, SHOULD],
                                        StandardAnalyzer())
        #		print(q.toString())
        topHits = 100
        scores = self._indexSearcher.search(q, topHits).scoreDocs
        results = []
        # collect up to the top 10 hits as (rank, docid, filename, contents)
        for i in range(min(10, len(scores))):
            doc = self._indexSearcher.doc(scores[i].doc)
            results.append((i + 1, scores[i].doc, doc.get("filename"),
                            doc.get("contents")))
            # print(i + 1)
            # print("Score: ", scores[i].score)
            # print("Title: ", doc.get("filename"))
            # print("Contents: ", doc.get("contents"))
        return results
Example #18
    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                                   query, ['docno', 'content'],
                                                   [SHOULD, SHOULD],
                                                   self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        for j, score_doc in enumerate(topDocs.scoreDocs):
            d = searcher.doc(score_doc.doc)
            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(score_doc.score)
Example #19
def query(q):
    """
    :param q:
    :return:search result, type list, eg. [{'name', 'path'}...]
    """
    lucene.initVM()
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    directory = SimpleFSDirectory(File(index_store_dir))
    print 'directory', directory
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ['name', 'title', 'content'], analyzer)
    query = MultiFieldQueryParser.parse(parser, q)
    scoreDocs = searcher.search(query, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        item = dict(date=doc.get('date'), name=doc.get('name'), title=doc.get('title'), summary=doc.get('summary'))
        result.append(item)
    return result
Example #20
    def searchGivenHallmarks(self, query, hallmarksList, hallmarksField, maxReturnLimit):
        qList = [query]
        qList.extend(hallmarksList)
        #print(qList)
        fList = ["text"]
        fList.extend([hallmarksField]*len(hallmarksList))
        #print(fList)
        flagList = [BooleanClause.Occur.MUST]
        flagList.extend([BooleanClause.Occur.MUST]*len(hallmarksList))
        #print(flagList)
        qp = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, qList, fList, flagList, self.analyzer)
        #print (qp)
        hits = self.searcher.search(qp, maxReturnLimit)
        result = []
        for hit in hits.scoreDocs:
            record = dict()
            doc = self.searcher.doc(hit.doc)
            record["id"] = doc.get("id")
            record["pos"]  = doc.get("pos")
            record["hallmarks"] = doc.get("hallmarks").split()
            #record["hallmarks-exp"] = doc.get("hallmarks-exp").split()
            record["text"] = doc.get("text")
            result.append(record)
        return result
Example #21
    def multiFieldsPairSearch(self, pair, sim):
        """
        Method that searches through documents using only content_section Field
        searchDir : the path to the folder that contains the index.
        """
        # Now search the index:
        title = pair[0].replace('_', ' ')
        content = pair[1]
        parser = MultiFieldQueryParser(["content_section", "title_article"],
                                       self.analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query1 = MultiFieldQueryParser.parse(parser, QueryParser.escape(title))
        query2 = MultiFieldQueryParser.parse(parser,
                                             QueryParser.escape(content))

        bq = BooleanQuery.Builder()
        bq.add(query1, BooleanClause.Occur.FILTER)
        bq.add(query2, BooleanClause.Occur.SHOULD)

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(bq.build(), 6).scoreDocs
        return hits
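A minimal sketch of the FILTER-vs-SHOULD composition used above (assumed names: analyzer, searcher): a FILTER clause constrains the result set without contributing to the score, while a SHOULD clause only affects ranking.

from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import BooleanClause, BooleanQuery

must_match = QueryParser('title_article', analyzer).parse('turing')
should_score = QueryParser('content_section', analyzer).parse('enigma machine')

bq = BooleanQuery.Builder()
bq.add(must_match, BooleanClause.Occur.FILTER)    # gates matches, not scored
bq.add(should_score, BooleanClause.Occur.SHOULD)  # drives the ranking
hits = searcher.search(bq.build(), 10).scoreDocs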
Example #22
class Searcher():
    """A simple interface to search articles.

    In this class `MultiFieldQueryParse`, `DuplicateFilter` are used to
    accomplish our application: query should apply on multiple fields,
    duplication should be avoid.
    """

    def __init__(self, index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0, title=8.0,
                            meta=2.0, content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Parameters
        ----------
        index_dir : string
            The location of lucene index.
        search_fields : list
            A list of field names indicating fields to search on.
        unique_field : string
            The field name on which duplication should be avoided.
        boost : dict
            This dict controls the field weights used when computing scores.
        date_format : string
            Format for converting strings into datetimes. Should be
            consistent with the indexing side.
        """
        self.index_dir = index_dir
        self.search_fields = search_fields
        self.sort_by_recent = Sort(SortField('date_published',
                                             SortField.Type.STRING, True))
        self.store = FSDirectory.open(File(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.dup_filter = DuplicateFilter(unique_field)
        self.boost_map = HashMap()
        for k, v in boost.iteritems():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                                self.boost_map)
        self.date_format = date_format

    def prepare_chained_filter(self, dt1, dt2):
        """Return a chained filter."""
        return ChainedFilter(
            [self.dup_filter,
             TermRangeFilter('date_published',
                             BytesRef(dt1.strftime(self.date_format)),
                             BytesRef(dt2.strftime(self.date_format)),
                             True, True)],
            [ChainedFilter.AND, ChainedFilter.AND]
        )

    def refresh(self):
        """Refresh the searcher if the index has changed."""
        nireader = DirectoryReader.openIfChanged(self.reader)
        if nireader:
            self.reader.close()
            self.reader = nireader
            self.isearcher = IndexSearcher(self.reader)
            logger.debug('Index file changed, refreshed')
        else:
            logger.debug('Index file did not change.')

    def fetch_one_doc(self, score_doc):
        """Fetch one document from the scored doc results."""
        doc = self.isearcher.doc(score_doc.doc)
        return (
            doc.getField("group_id").numericValue().intValue(),
            doc.get("canonical_url"),
            doc.get("title"),
            doc.get("date_published"),
            doc.get("domain"),
            doc.get("site_type"),
            score_doc.score,
        )

    def search(self, query, n1=100, n2=100000,
               sort_by='relevant',
               use_lucene_syntax=False,
               min_score_of_recent_sorting=0.4,
               min_date_published=None):
        """Return the matched articles from lucene.

        Parameters
        ----------
        query : string
            The query string.
        n1 : int
            How many results are finally returned.
        n2 : int
            How many search results are returned when sorting by recent.
        sort_by : string
            {'relevant', 'recent'}, the sorting order when doing lucene searching.
        min_score_of_recent_sorting : float
            The minimum score when sorting by 'recent'.
        min_date_published : datetime
            The minimum date_published when filtering lucene searching results.

        Returns
        -------
        tuple
            (total_hits, df), where total_hits represents the total number
            of hits and df is a pandas.DataFrame object. df.columns = ['id',
            'canonical_url', 'title', 'date_published', 'domain', 'site_type',
            'score']
        """
        if min_date_published is not None:
            dt2 = datetime.utcnow()
            if isinstance(min_date_published, datetime):
                dt1 = min_date_published
            elif isinstance(min_date_published, basestring):
                dt1 = utc_from_str(min_date_published)
            sf = self.prepare_chained_filter(dt1, dt2)
        else:
            sf = self.dup_filter
        try:
            if use_lucene_syntax is False:
                query = clean_query(query)
            q = self.mul_parser.parse(self.mul_parser, query)
            logger.debug('Parsed query: %s', q)
        except Exception as e:
            logger.error(e)
            if use_lucene_syntax is True:
                raise APIParseError("""Error when parsing the query string! \
You are querying with lucene syntax, be careful with your query string!""")
            else:
                raise APIParseError('Error when parsing the query string!')

        cnames = ['id', 'canonical_url', 'title', 'date_published',
                  'domain', 'site_type', 'score']
        if sort_by == 'relevant':
            top_docs = self.isearcher.search(q, sf, n1)
            score_docs = top_docs.scoreDocs
            total_hits = top_docs.totalHits
            if total_hits == 0:
                df = pd.DataFrame()
            else:
                records = [self.fetch_one_doc(sd) for sd in score_docs]
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return total_hits, df
        elif sort_by == 'recent':
            counter = 0
            records = []
            top_field_docs = self.isearcher.search(q, sf, n2,
                                                   self.sort_by_recent,
                                                   True, True)
            if top_field_docs.maxScore >= min_score_of_recent_sorting:
                for sd in top_field_docs.scoreDocs:
                    if sd.score >= min_score_of_recent_sorting:
                        records.append(self.fetch_one_doc(sd))
                        counter += 1
                        if counter == n1:
                            break
            if counter == 0:
                df = pd.DataFrame()
            else:
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return counter, df
Example #23
    def search(self, command, num, use_clf):
        print("log1", command, num, use_clf)
        self.vm.attachCurrentThread()
        searcher = self.searcher

        print("command", command)

        if (not self.reT.search(command)):
            if (use_clf):
                print("sentence feed to classify", command)
                probs = self.classifier.classify(command)
                command = self.text.seg(command)
                command = self.text.remove_stop_word(command)
                # command = self.text.replace_white_space_with_dash(command)
                key = sorted(range(len(self.keys)),
                             key=lambda i: probs[i],
                             reverse=True)
                key_use = []
                key_use.append(key[0])
                for i in key[1:]:
                    if probs[i] > 0.3 or probs[i] - probs[key[0]] > -0.1:
                        key_use.append(i)

                command_final = self.keys[key_use[0]] + ":(" + command + ")"
                for i in key_use[1:]:
                    command_final = "%s OR %s:(%s)" % (command_final,
                                                       self.keys[i], command)
                command = command_final

                # command = "Title:\"2016 吉 07 民终 491号 包颜峰诉\""
                # command = "PubDate:\"2016 11 24\""
                # command = "WBSB:浙江省 WBSB:苍南县 WBSB:人民法院"
                print(command)
                # command = "Title:陕西省-高级-人民法院 Pubdate:陕西省-高级-人民法院"
                query = QueryParser("PubDate",
                                    WhitespaceAnalyzer()).parse(command)
                # parser =  MultiFieldQueryParser(['WBSB'], self.analyzer)
                # parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
                # query =parser.parse(QueryParserBase,command)

                # P = QueryParser('Pubdate', CJKAnalyzer())
                # query = MultiFieldQueryParser(['WBSB','Pubdate'],CJKAnalyzer()).parse(P,command)
                #
                #
                # # query = MultiFieldQueryParser(['WBSB',"title"], CJKAnalyzer()).getMultiFieldQuery(q)
                # # p = QueryParser('Title', CJKAnalyzer()).parse("你好 中国 你好 北京")
                # print(query)

                # fields = []
                # # fields = ["filename", "contents", "description"]
                #
                # for i in key_use:
                #     fields.append(self.keys[i])
                # flags = [BooleanClause.Occur.SHOULD]*len(fields)
                #
                # query=MultiFieldQueryParser.parse(command, fields, flags, WhitespaceAnalyzer())
                #
                print(query)

                scoreDocs = searcher.search(query, num).scoreDocs

                results = []

                for scoreDoc in scoreDocs:
                    doc = searcher.doc(scoreDoc.doc)
                    result = dict()
                    for i in self.keys:
                        result[i] = doc.get(i)
                    result['id'] = doc.get('id')
                    results.append(result)
                probs_tmp = ""
                for key, prob in zip(self.keys, probs):
                    probs_tmp += "%s:%2f " % (key, prob)
                probs = probs_tmp
                key_use_tmp = ""
                for i in key_use:
                    key_use_tmp += "%s " % (self.keys[i])
                key_use = key_use_tmp
                return results, probs, key_use

            else:
                command = self.text.seg(command)
                command = self.text.remove_stop_word(command)
                fields = self.keys
                flags = [BooleanClause.Occur.SHOULD] * len(fields)

                query = MultiFieldQueryParser.parse(command, fields, flags,
                                                    WhitespaceAnalyzer())

                # command_final = "Title:" + command
                # for i in self.keys[1:]:
                #     command_final = "%s OR %s:%s" % (command_final, i, command)
                # command = command_final
                # print(command)
                # query = QueryParser("Title", self.analyzer).parse(command)
                print(query)
                scoreDocs = searcher.search(query, num).scoreDocs

                results = []

                for scoreDoc in scoreDocs:
                    doc = searcher.doc(scoreDoc.doc)
                    result = dict()
                    for i in self.keys:
                        result[i] = doc.get(i)
                    result['id'] = doc.get('id')
                    results.append(result)
                return results, [None] * len(self.keys), self.keys
        else:
            print('command', command)
            ps = self.reT.findall(command)
            print(ps)
            print(type(command))
            # re.sub takes (replacement, string); the original had them swapped
            rem = self.reT.sub(' ', command)
            print(ps)
            print(rem)
            q_t = []
            key_use = []
            for i in ps:

                f = i[1]
                data = i[4]
                rela = i[5]

                key_use.append(f)

                q_t.append(f)
                q_t.append(':')
                seg_t = self.text.seg(data)
                seg_t = self.text.remove_stop_word(seg_t)
                dash_t = self.text.replace_white_space_with_dash(seg_t)
                q_t.append(dash_t)
                if (rela):
                    q_t.append(" %s " % rela)
                print('tract pattern', q_t)
            q_f = "".join(q_t)
            print("final q", q_f)
            query = QueryParser("PubDate", SimpleAnalyzer()).parse(q_f)
            print("query", query)
            scoreDocs = searcher.search(query, num).scoreDocs

            results = []

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                result = dict()
                for i in self.keys:
                    result[i] = doc.get(i)
                result['id'] = doc.get('id')
                results.append(result)
            return results, [None] * len(key_use), key_use
Example #24
def getRelatedArticles(pmcid):
    import tools.retriever as retriever
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    parser = MultiFieldQueryParser(Version.LUCENE_4_10_1, ["pmcid"], analyzer)
    #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    q = MultiFieldQueryParser.parse(parser, pmcid)
    MAX = 10000
    hits = searcher.search(q, MAX)
    doc_id = hits.scoreDocs[0].doc  # Lucene doc id of the article itself
    
    
    result = BooleanQuery()
    result.add(BooleanClause(q, BooleanClause.Occur.MUST_NOT))
    
    
    titlemlt = getSimilarityGenerator("title", 1, 1, 2)
    abstractmlt = getSimilarityGenerator("abstract", 2, 5, 2)
    citationmlt = getSimilarityGenerator("citation", 2, 5, 2)
    fulltextmlt = getSimilarityGenerator("fulltext", 2, 5, 2)
    keywordmlt = getSimilarityGenerator("keyword", 1, 1, 1)   
    
    titleQ = titlemlt.like(doc_id)
    titleQ.setBoost(0.2)
    abstractQ = abstractmlt.like(doc_id)
    abstractQ.setBoost(0.1)

    #Do we even want to include a query for similar citations?
    citationQ = citationmlt.like(doc_id)
    citationQ.setBoost(0.0)

    fulltextQ = fulltextmlt.like(doc_id)
    fulltextQ.setBoost(0.0)

    keywordQ = keywordmlt.like(doc_id)
    keywordQ.setBoost(0.0)

    result.add(BooleanClause(titleQ, BooleanClause.Occur.SHOULD))
    result.add(BooleanClause(abstractQ, BooleanClause.Occur.SHOULD))
    result.add(BooleanClause(citationQ, BooleanClause.Occur.SHOULD))
    result.add(BooleanClause(fulltextQ, BooleanClause.Occur.SHOULD))
    result.add(BooleanClause(keywordQ, BooleanClause.Occur.SHOULD))
    
    hits = searcher.search(result, 5)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, result)
    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title" : doc.get("title")}#we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict 
    
    #Where we get the images for all the pmcids    
    images = retriever.get_image_pmcid(pmcids, "all")#should take in pmcids and class
    #create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        path = img.get("filepath") + "/" + img.get("figureid")
        imagesDict.setdefault(img_pmcid, []).append(path)
            
    #for each pmcid, we will assign an image to it for the search results
    for pmcid in pmcids:
        docDict = documentDict[pmcid]
        if pmcid in imagesDict:
            docDict["imgURL"] = imagesDict[pmcid][0]
        else:
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
        documentDict[pmcid] = docDict
    
    #END - Where we get the images for all the pmcids
    
    
    return documentDict
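getSimilarityGenerator is project-specific; a plausible sketch of what it wraps, assuming Lucene 4.x MoreLikeThis and that the three numbers mean (min term freq, min doc freq, min word length):

from org.apache.lucene.queries.mlt import MoreLikeThis

def getSimilarityGenerator(field, minTermFreq, minDocFreq, minWordLen):
    mlt = MoreLikeThis(reader)   # reader as opened above
    mlt.setAnalyzer(analyzer)
    mlt.setFieldNames([field])
    mlt.setMinTermFreq(minTermFreq)
    mlt.setMinDocFreq(minDocFreq)
    mlt.setMinWordLen(minWordLen)
    return mlt                   # .like(doc_id) then yields the query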
Example #25
class Searcher():
    """A simple interface to search articles.

    In this class `MultiFieldQueryParse`, `DuplicateFilter` are used to
    accomplish our application: query should apply on multiple fields,
    duplication should be avoid.
    """
    def __init__(self,
                 index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0,
                            title=8.0,
                            meta=2.0,
                            content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Parameters
        ----------
        index_dir : string
            The location of lucene index.
        search_fields : list
            A list of field names indicating fields to search on.
        unique_field : string
            The field name on which duplication should be avoided.
        boost : dict
            This dict controls the field weights used when computing scores.
        date_format : string
            Format for converting strings into datetimes. Should be
            consistent with the indexing side.
        """
        self.index_dir = index_dir
        self.search_fields = search_fields
        self.sort_by_recent = Sort(
            SortField('date_published', SortField.Type.STRING, True))
        self.store = FSDirectory.open(Paths.get(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.boost_map = HashMap()
        for k, v in boost.items():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                                self.boost_map)
        self.date_format = date_format

    def query_between_dates(self, dt1, dt2, original_query=None):
        '''Return a range query matching records published between dt1 and
        dt2, inclusive (original_query is accepted but currently unused).'''
        return TermRangeQuery(
            'date_published',  # Field
            BytesRef(dt1.strftime(self.date_format)),  # Lower bound
            BytesRef(dt2.strftime(self.date_format)),  # Upper bound
            True,  # Include lower bound
            True  # Include upper bound
        )

    def refresh(self):
        """Refresh the searcher if the index has changed."""
        nireader = DirectoryReader.openIfChanged(self.reader)
        if nireader:
            self.reader.close()
            self.reader = nireader
            self.isearcher = IndexSearcher(self.reader)
            logger.debug('Index file changed, refreshed')
        else:
            logger.debug('Index file did not change.')

    def fetch_one_doc(self, score_doc):
        """Fetch one document from the scored doc results."""
        doc = self.isearcher.doc(score_doc.doc)
        return (
            doc.getField("group_id").numericValue().intValue(),
            doc.get("canonical_url"),
            doc.get("title"),
            doc.get("date_published"),
            doc.get("domain"),
            doc.get("site_type"),
            score_doc.score,
        )

    def search(self,
               query,
               n1=100,
               n2=100000,
               sort_by='relevant',
               use_lucene_syntax=False,
               min_score_of_recent_sorting=0.4,
               min_date_published=None):
        """Return the matched articles from lucene.

        Parameters
        ----------
        query : string
            The query string.
        n1 : int
            How many results are finally returned.
        n2 : int
            How many search results are returned when sorting by recent.
        sort_by : string
            {'relevant', 'recent'}, the sorting order when doing lucene searching.
        min_score_of_recent_sorting : float
            The minimum score when sorting by 'recent'.
        min_date_published : datetime
            The minimum date_published when filtering lucene searching results.

        Returns
        -------
        tuple
            (total_hits, df), where total_hits represents the total number
            of hits and df is a pandas.DataFrame object. df.columns = ['id',
            'canonical_url', 'title', 'date_published', 'domain', 'site_type',
            'score']
        """
        if min_date_published is not None:
            dt2 = datetime.utcnow()
            if isinstance(min_date_published, datetime):
                dt1 = min_date_published
            elif isinstance(min_date_published, str):
                dt1 = utc_from_str(min_date_published)
            q_dates = self.query_between_dates(dt1, dt2)
        try:
            if use_lucene_syntax is False:
                query = clean_query(query)
            q = self.mul_parser.parse(self.mul_parser, query)
            logger.warning(q)
            if 'date_published:' in query:
                end = query.find('AND date_published')
                q_without_date_published = query[:end]
                logger.warning(q_without_date_published)
                q = self.mul_parser.parse(self.mul_parser,
                                          q_without_date_published)
                date_published_splits = query.split('date_published:[')
                date_range = date_published_splits[-1]
                date_range = date_range[:-1]
                logger.warning(date_range)
                if 'TO' in date_range:
                    date_range_splits = date_range.split('TO')
                    dt1_string = date_range_splits[0]
                    # handle a trailing wildcard ('*') in the date string
                    if '*' in dt1_string:
                        date1_end = dt1_string.find('*') - 1
                        dt1_string = dt1_string[:date1_end]
                        logger.warning(dt1_string)
                    dt1 = utc_from_str(dt1_string)
                    dt2_string = date_range_splits[1]
                    if '*' in dt2_string:
                        date2_end = dt2_string.find('*') - 1
                        dt2_string = dt2_string[:date2_end]
                        logger.warning(dt2_string)
                    dt2 = utc_from_str(dt2_string)
                    query_dates = self.query_between_dates(dt1, dt2)
                    q = combine_queries(q, query_dates)
            if min_date_published is not None:
                q = combine_queries(q, q_dates)
            logger.warning('Parsed query: %s', q)
        except Exception as e:
            logger.error(e)
            if use_lucene_syntax is True:
                raise APIParseError("""Error when parsing the query string! \
You are querying with lucene syntax, be careful with your query string!""")
            else:
                raise APIParseError('Error when parsing the query string!')

        cnames = [
            'id', 'canonical_url', 'title', 'date_published', 'domain',
            'site_type', 'score'
        ]
        if sort_by == 'relevant':
            top_docs = self.isearcher.search(q, n1)
            score_docs = top_docs.scoreDocs
            total_hits = top_docs.totalHits
            if total_hits == 0:
                df = pd.DataFrame()
            else:
                records = [self.fetch_one_doc(sd) for sd in score_docs]

                # Index in each record of canonical URL and title
                canonical_url, title = 1, 2
                # Store 2-tuples of (site, article title) as keys in dict then
                # turn back to list
                unique_docs = dict()
                for record in records:
                    key = (record[canonical_url], record[title])
                    if key not in unique_docs:
                        unique_docs[key] = record
                # Include only unique records
                records = list(unique_docs.values())
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return total_hits, df
        elif sort_by == 'recent':
            counter = 0
            records = []
            top_field_docs = self.isearcher.search(q, n2, self.sort_by_recent,
                                                   True, True)
            if top_field_docs.maxScore >= min_score_of_recent_sorting:
                for sd in top_field_docs.scoreDocs:
                    if sd.score >= min_score_of_recent_sorting:
                        records.append(self.fetch_one_doc(sd))
                        counter += 1
                        if counter == n1:
                            break
            if counter == 0:
                df = pd.DataFrame()
            else:
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return counter, df
Example #26
indexDir = File("/tmp/github")

# 1. open the index
analyzer = KeywordAnalyzer()
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. parse the query from the command line
# a = {"typed_method_call": WhitespaceAnalyzer()}
# wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)

query_string = "HttpURLConnection.disconnect Exception.printStackTrace BufferedReader.close HttpURLConnection.setRequestProperty HttpURLConnection.setRequestMethod DataOutputStream.writeBytes HttpURLConnection.getInputStream DataOutputStream.close HttpURLConnection.setUseCaches StringBuffer.append URL.openConnection HttpURLConnection.getOutputStream Integer.toString String.getBytes StringBuffer.toString HttpURLConnection.setDoOutput BufferedReader.readLine DataOutputStream.flush HttpURLConnection.setDoInput"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                     ["typed_method_call"], analyzer)

#base_query = getSpanNearQuery(analyzer, query_string)

base_query = query_parser.parse(query_string)

#http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
# boost_query = FunctionQuery( LongFieldSource("view_count"))
#query = CustomScoreQuery(base_query, boost_query)

# queryparser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
# query = queryparser.parse(query_string)

# 3. search the index for the query
# We retrieve and sort all documents that match the query.
# In a real application, use a TopScoreDocCollector to sort the hits.
Example #27
    def getIntersectionCount(self, query, countTermString, sfield, cfield):
        qp = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT,
            [query, countTermString],
            [sfield, cfield],
            [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST],
            self.analyzer)
        collector = TotalHitCountCollector()
        self.searcher.search(qp, collector)
        return collector.getTotalHits()
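A hedged usage sketch (idx stands for a hypothetical instance of the surrounding class; the field names are assumptions in line with Example #20):

count = idx.getIntersectionCount('"mice"', '"apoptosis"', 'text', 'hallmarks')
print("documents matching both terms: %d" % count)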
Example #28
indexDir = File("/tmp/stackoverflow")

# 1. open the index
analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. parse the query from the command line
a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer()}
wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)

query_string = "lucene get similar documents to the current one"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["title"],
                                     wrapper_analyzer)

#base_query = getSpanNearQuery(analyzer, query_string)

base_query = query_parser.parse(query_string)

#http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
boost_query = FunctionQuery(LongFieldSource("view_count"))
query = CustomScoreQuery(base_query, boost_query)

# queryparser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
# query = queryparser.parse(query_string)

# 3. search the index for the query
# We retrieve and sort all documents that match the query.
# In a real application, use a TopScoreDocCollector to sort the hits.
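The listing stops where step 3 begins. A minimal completion under the same setup (a plain top-N search rather than the TopScoreDocCollector the comment recommends; the "title" field is an assumption):

searcher = IndexSearcher(reader)
for hit in searcher.search(query, 50).scoreDocs:
    doc = searcher.doc(hit.doc)
    print("%s (score %.3f)" % (doc.get("title"), hit.score))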