Example #1
def irsolver(data_file, index) :
	from questions import get_input_data
	lucene.initVM()
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	reader = IndexReader.open(SimpleFSDirectory(File(index)))
	searcher = IndexSearcher(reader)
	pred = []
	mapp = { 1 : 'A', 2 : 'B', 3 : 'C', 4 : 'D'}

	idx, ques, ans = get_input_data(data_file)
	for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)) :
		max_score = -1000000
		best_ans = 'A'
		for i, ai in enumerate(a):
			sc = query(q, ai, analyzer, searcher)
			print(acm, i, sc)
			if sc > max_score :
				max_score = sc
				best_ans = mapp[i+1]
		pred.append(best_ans)

	return idx, pred
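Example #1 relies on a query() helper and a stopwords list that the snippet does not define. A minimal sketch of what such a helper might look like, assuming a "text" field and that the top hit's score is returned (all names here are assumptions, not taken from the original project):

# Hypothetical sketch of the missing query() helper.
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.util import Version

def query(question, answer, analyzer, searcher, max_hits=10):
	# combine question and candidate answer into one escaped query string
	escaped = QueryParser.escape(question + " " + answer)
	parsed = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(escaped)
	hits = searcher.search(parsed, max_hits)
	if len(hits.scoreDocs) == 0:
		return 0.0
	return hits.scoreDocs[0].score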
Example #2
def get_candidates(qatp):

    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q,a,t,p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n+=1

        q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)
        
    return candidates
Example #3
def evaluate_index(index_dir, context, analyzer):
    # eval time of indexing (overall)
    # we should also measure the elapsed time of
    # each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end-start

    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)


    # print str(vocab_size) # size of vocabulary
    # print str(vocabulary.getDocCount()) # #docs that have at least one term for title field
    # print str(vocabulary.getSumTotalTermFreq()) # #tokens
    # print str(vocabulary.getSumDocFreq()) # #postings

    reader.close()
    return duration, vocab_size
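A possible way to drive this benchmark; the paths, the analyzer and the surrounding imports are assumptions, not part of the original module:

# Hypothetical usage of evaluate_index(); assumes lucene, StandardAnalyzer,
# Version and the Indexer class are importable in this module.
if __name__ == "__main__":
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    duration, vocab_size = evaluate_index("/tmp/eval_index", "/data/corpus", analyzer)
    print("indexing took %.2f s, title vocabulary holds %d terms" % (duration, vocab_size))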
Example #4
File: retriever.py  Project: kevkid/YIF
def getRandomDoc2():
    
        location = web.__path__[0] + "/static/web/files/index/index.articles"
        #lucene.initVM()
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        reader = IndexReader.open(SimpleFSDirectory(File(location)))
        searcher = IndexSearcher(reader)
     
        #query = QueryParser(Version.LUCENE_4_10_1, "keywords", analyzer).parse(queryString)#"Shigella sonnei"
        MAX = 1000
        docNum = randrange(0, reader.maxDoc())
        doc = reader.document(docNum)
     
        #print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        files = []
        fileRoots = []
        paths = []
        paths.append(doc.get("articlepath"))
        pth = paths[0].replace("/home/kevin/Downloads/","/home/kevin/git/YIF/imageFinder/web/static/web/")#os.path.join(tools.__path__,"static/web/images")
        for root, directories, filenames in os.walk(pth):#probably something wrong with the location
            for filename in filenames:
                if filename.endswith((".jpg", ".gif", ".png")):
                    files.append(root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/","") + "/" + filename)#temp, will need to change
                    fileRoots.append(root)
                    print (root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/","") + "/" + filename)
        try: 
            rng = randrange(0, len(files))
        except:
            return -1
        else:
            return files[rng]
Example #5
File: retriever.py  Project: kevkid/YIF
def get_image_pmcid(pmcid, classes = ""):
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    
    #query = query.parse(query, ('4175339','1'))
    # query.parse(queryString)#"Shigella sonnei"
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    #hits = searcher.search(query, MAX)
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) +")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query
    q = query.parse(queryStr) 
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))
    return docs #This will return the image documents that belong to a pmcid(article)
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)

    fields = ["title", "abstract", "authors"]

    st = PorterStemmer()
    for id, q in queries.iteritems():
        query = q
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(query)
        qwords_k = [st.stem(q) for q in qwords]
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc+1, i+1, hit.score))
            # print hit.doc+1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")
    f.close()
Example #7
 def __init__(self, path):
     print "Loading data.json..."
     with open(path, "r") as f:
         self.data = json.load(f)
     lucene.initVM()
     self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
     self.reader = IndexReader.open(SimpleFSDirectory(File("index/")))
     self.searcher = IndexSearcher(self.reader)
Example #8
 def __init__(self,
              lucene_index_dir='lucene_index/',
              num_docs_to_return=100):
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     directory = SimpleFSDirectory(File(lucene_index_dir))
     self.searcher = IndexSearcher(DirectoryReader.open(directory))
     self.num_docs_to_return = num_docs_to_return
     self.ireader = IndexReader.open(directory)
Example #9
 def __init__(self, lucene_dir_path):
     if lucene_dir_path!=None and lucene_dir_path!='':
         lucene.initVM()
         directory = SimpleFSDirectory(File(lucene_dir_path))
         self.indexReader = IndexReader.open(directory)
         self.is_init=True
     else:
         self.is_init=False
Example #10
File: retriever.py  Project: kevkid/YIF
def SearchQuery(queryString, fields, classification): 
    #if __name__ == "__main__":
    #if __name__ == "retriever":
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    #query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(query, queryString)
    #query.parse(queryString)#"Shigella sonnei"
    #query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    hits = searcher.search(query, MAX)
 
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title" : doc.get("title")}#we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict 
    
    #Where we get the images for all the pmcids    
    images = get_image_pmcid(pmcids, classification)#should take in pmcids and class
    #create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid") 
        if img_pmcid in imagesDict.keys():
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
            
        else:
            imagesDict[img_pmcid] = [(img.get("filepath") + "/" + img.get("figureid"))]
            
    #for each pmcid, we will assign an image to it for the search results
    for pmcid in pmcids:
        if pmcid in imagesDict:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = imagesDict[pmcid][0] 
            documentDict[pmcid] = docDict 
        else:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
            documentDict[pmcid] = docDict
    
    #END - Where we get the images for all the pmcids
    
    
    return documentDict
Example #11
 def __init__(
     self,
     lucene_index_dir='/dccstor/cssblr/amrita/dialog_qa/code/prepro_lucene/lucene_index/'
 ):
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     directory = SimpleFSDirectory(File(lucene_index_dir))
     self.searcher = IndexSearcher(DirectoryReader.open(directory))
     self.num_docs_to_return = 5
     self.ireader = IndexReader.open(directory)
Example #12
 def load_index(self):
     indexDir = File(self.index_path)
     a = {"code": self.porter_analyzer}
     self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
     index = SimpleFSDirectory(indexDir)
     self.reader = IndexReader.open(index)
     n_docs = self.reader.numDocs()
     self.searcher = IndexSearcher(self.reader)
     print("Index contains %d documents." % n_docs)
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: a different query string is used for each field,
    rather than the same words on every field
    :param q_string:
    :param q_class:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    #  SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
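lucene_retrieval_multifield above (and the lucene_retrieval variants later on this page) read several module-level globals that the snippets never show: set_lucene_index, version, analyzer and hitsPerPage. A plausible setup sketch, with every value being an assumption:

# Hypothetical module-level setup assumed by the lucene_retrieval* snippets.
import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

lucene.initVM()
version = Version.LUCENE_4_10_1               # version passed to the query parsers
analyzer = StandardAnalyzer(version)          # shared analyzer
hitsPerPage = 20                              # size of the TopScoreDocCollector
set_lucene_index = {'ind': SimpleFSDirectory(File('lucene_index/'))}  # later opened via IndexReader.open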
Example #14
def search_results(search):
    results = []
    search_string = search.data['search']

    print "buscando:" + search_string

    lucene.initVM()
    analyzer = SpanishAnalyzer()
    reader = IndexReader.open(SimpleFSDirectory(File(indexDirectory)))
    searcher = IndexSearcher(reader)

    query = QueryParser("text", analyzer).parse(search_string)
    MAX = 1000
    hits = searcher.search(query, MAX)

    if not hits.totalHits:
        flash('No results found!')
        return redirect('/')
    else:

        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        flash("Found %d document(s) that matched query '%s':" %
              (hits.totalHits, query))
        # render_template('index.html', form=search)
        items = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            if len(items) > 10:
                flash('Returning only first 10 results')
                break

            doc = searcher.doc(hit.doc)
            items.append(
                Row(
                    hit.score,
                    doc.get("tipo_sesion"),
                    doc.get("organo"),
                    doc.get("presidente"),
                    doc.get("dia") + "/" + doc.get("mes") + "/" +
                    doc.get("anio"),
                    doc.get("tipo_epigrafe"),
                    doc.get("text").replace(
                        search_string, '<span class="highlightme">' +
                        search_string + '</span>'),
                    # doc.get("filename") # + "<a href='#'>hola"
                    '<a href="' + doc.get("filename") + '">' +
                    doc.get("filename") + '</a>'))
        #     print hit.score, hit.doc, hit.toString()
        #     doc = searcher.doc(hit.doc)
        #     # print doc.get("text").encode("utf-8")
        #     print

        # display results
        return render_template('index.html',
                               form=search,
                               items=items,
                               search_string=search_string)
Example #15
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type) if len(doc_score_list) != 0 else [0] * len(
                       feature_type)  # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text',
                        analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #16
    def xmlrpc_getStatistics(self, instance):
        reader = IndexReader.open(self.indexPath)

        filter = RangeFilter('instance', instance, instance, 1, 1)

        num = filter.bits(reader).cardinality()

        stat = Vector()
        stat.add(num)
        stat.add(0)  # len(index.terms())
        reader.close()
        return stat
Example #17
 def __init__(self):
     #self.segmentor.load('./cws.model')
     INDEXDIR = './Myindex'
     #lucene.initVM(vmargs='-Xcheck:jni,-verbose:jni,-verbose:gc')
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     #vm_env = lucene.getVMEnv()
     #vm_env.attachCurrentThread()
     #lucene.initVM(vmargs='-')
     #print 'lucene', lucene.VERSION
     self.directory = SimpleFSDirectory(File(INDEXDIR))
     self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
     self.analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
     self.reader = IndexReader.open(self.directory)
Example #18
def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data
    data = get_input_data(data_file)

    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)

    generate_docids(data, data_file, analyzer, searcher)
Example #19
def group_tests():

    TP = 0.0
    FN = 0.0
    n = 0.0
    precision = 0
    recall = 0

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File("./articleTitleIndex/")))
    searcher = IndexSearcher(reader)
    with open('Labeled800Queries/labeler3.txt', 'r') as f:
        for line in f:
            n += 1
            line = line.split('\t')
            user_query = line[0]
            labels = line[1:]
            user_query = re.sub('[^0-9a-zA-Z]+', ' ', user_query)
            print user_query
            print labels
            res =  predict(user_query, analyzer, reader, searcher, test = "group")

            converted_res = []
            for label in res:
                #print label[0]
                converted_res.append(cvt.WikiToKDD[label[0].replace('_', ' ')])

            if not res:
                print "empty goal category set"
            print converted_res

            """ compare labels and converted_res """
            for label in labels:
                label = label.replace('\r', '')
                label = label.replace('\n', '')
                if label not in cvt.WikiToKDD.values():
                    continue
                #print label
                if label in converted_res:
                    TP += 1.0
                else:
                    FN += 1.0
            
            print "=========================================================="

    precision = TP/(SIZE*n)
    recall = TP/(TP+FN)

    print "precision:", precision
    print "recall:", recall
Example #20
def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data

    data = get_input_data(data_file)

    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)

    generate_docids(data, data_file, analyzer, searcher)
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0]*len(feature_type) # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #22
def make_request(query, analyzer, index, qparser_regexp=None, max_results=100):
    """ Returns list of Score objects, which exposes `doc` (document ID in lucene) and `score` attrs
    :param query:
    :param analyzer:
    :param index:
    :param qparser_regexp:
    :param max_results:
    :return:
    """
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(
        query if not qparser_regexp else re.sub(qparser_regexp, " ", query))
    hits = searcher.search(query, max_results)
    return hits.scoreDocs
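make_request only returns scoreDocs (Lucene doc id plus score); stored fields still have to be fetched through a searcher over the same index. An illustrative follow-up, where the index path, analyzer and field name are assumptions:

# Hypothetical usage of make_request(); path and analyzer are assumptions.
scored = make_request("apache lucene scoring", StandardAnalyzer(Version.LUCENE_30),
                      "/data/lucene_index", max_results=10)
reader = IndexReader.open(SimpleFSDirectory(File("/data/lucene_index")))
searcher = IndexSearcher(reader)
for sd in scored:
    doc = searcher.doc(sd.doc)        # resolve the stored document behind the hit
    print(sd.score, doc.get("content"))
reader.close()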
Example #23
def make_request(query, analyzer, index, qparser_regexp=None, max_results=100):
    """
    :param query:
    :param analyzer:
    :param index:
    :param qparser_regexp:
    :param max_results:
    :return:
    """
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(
        query if not qparser_regexp else re.sub(qparser_regexp, " ", query))
    hits = searcher.search(query, max_results)
    return hits.scoreDocs
Example #24
 def find(self, query):
     transformer = StringTransformer()
     analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
     reader = IndexReader.open(SimpleFSDirectory(File("index/")))
     searcher = IndexSearcher(reader)
     searcher.setSimilarity(BM25Similarity())
     processed_query = ' '.join(
         self._preprocessor(transformer.transform(query)))
     query = QueryParser(Version.LUCENE_CURRENT, "content",
                         analyzer).parse(processed_query)
     hits = searcher.search(query, 10)
     result_list = []
     for hit in hits.scoreDocs:
         doc = searcher.doc(hit.doc)
         result_list.append(doc.get("path").encode("utf-8"))
     return result_list
Example #25
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    #setting writer configurations
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    ### as of now, deletion of documents is supported only on indexed keys ###
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key,
                           analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 000
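A usage sketch for delete(): the collection name, primary-key list and JSON payload below are made up, while the return codes (100 for bad JSON, 105 when the index cannot be opened, 0 on success) come from the function above.

# Hypothetical call to delete(); all values below are illustrative.
status = delete(primary_keys_map=["id"],
                collection_name="products",
                todelete=json.dumps({"id": "42"}),
                commit=True)
if status != 0:
    print("delete failed with code %d" % status)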
def delete(primary_keys_map,collection_name,todelete,commit=False):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT

	try:
		tofind_keyvalue_pairs=json.loads(todelete)
	except:
		return 100	
	

	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)

	#setting writer configurations
	try:
		config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer=IndexWriter(direc,config)
		ireader=IndexReader.open(direc)
	except:
		return 105

	### as of now, deletion of documents is supported only on indexed keys ###
	tofind_primary_keyvalue_pairs={}
	tofind_nonprimary_keyvalue_pairs={}

	#separating out primary and non_primary keys
	for key in tofind_keyvalue_pairs.keys():
		if key in primary_keys_map:
			tofind_primary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]
		else:
			tofind_nonprimary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]

	#filtering documents according to primary keys		
	query=BooleanQuery()
	for key in tofind_primary_keyvalue_pairs.keys():
		temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(tofind_primary_keyvalue_pairs[key])
		query.add(BooleanClause(temp,BooleanClause.Occur.MUST))

	a=writer.deleteDocuments(query)
	if commit==True:
		writer.commit()
	writer.close()
	return 000
Example #27
def make_request(query, analyzer, index, qparser_regexp=None, max_results=100):
    """
    :param query:
    :param analyzer:
    :param index:
    :param qparser_regexp:
    :param max_results:
    :return:
    """
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(
        query if not qparser_regexp else re.sub(qparser_regexp, " ", query)
    )
    hits = searcher.search(query, max_results)
    return hits.scoreDocs
def number(collection_name):
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT

	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)
	try:
		ireader=IndexReader.open(direc)
	except:
		return 105
	numdocs = int(ireader.numDocs())

	ireader.close()

	return numdocs
Example #29
def make_request(query, analyzer, index, qparser_regexp=None, max_results=100):
    """ Returns list of Score objects, which exposes `doc` (document ID in lucene) and `score` attrs
    :param query:
    :param analyzer:
    :param index:
    :param qparser_regexp:
    :param max_results:
    :return:
    """
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(
        query if not qparser_regexp else re.sub(qparser_regexp, " ", query)
    )
    hits = searcher.search(query, max_results)
    return hits.scoreDocs
Example #30
def lucene_retrieval(q_string, use_BM25=False):
    """

    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return doc_name & score
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'

    # escape special characters via escape function
    if q_string and q_string.strip():   # when pre-process answers, `none of the above` -> '' cause error here
        #print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)

        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)

        # reader.close()
    return result  # text: also nodes
Example #31
def number(collection_name):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
    except:
        return 105
    numdocs = int(ireader.numDocs())

    ireader.close()

    return numdocs
Example #32
    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query, ['docno', 'content'], [SHOULD, SHOULD], self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        j = 0
        for i in topDocs.scoreDocs:
            d = searcher.doc(i.doc)

            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)

            j += 1
Example #33
File: retriever.py  Project: kevkid/YIF
def getDocumentPMC_ID(pmcid, imageAndTitle = 0):
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer).parse(pmcid)#"Shigella sonnei"
    MAX = 1000
    hits = searcher.search(query, MAX)
    title = ""
    abstract = ""
    fullText = "http://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
    doi = ""#need to split
    
    volume = ""
    year = ""
    publisher = ""
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        if(imageAndTitle == 1):
            paths = []
            paths.append(doc.get("articlepath"))
            image = get_image(paths)
            
        abstract = doc.get("abstract")
        doi = doc.get("doi")
        title = doc.get("title")
        volume = doc.get("volume")
        year = doc.get("year")
        publisher = doc.get("publisher")
    if doi is not None:
        doiSecond = doi.split('/')
        doiSecond = doiSecond[1]#second part
    else:
        doiSecond = ""
    #http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3363814/pdf/cc11003.pdf
    pdf = "http://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/pdf/" + doiSecond + ".pdf" 
    if(imageAndTitle == 1):
        return title, image, pmcid#image may sometimes show up    
    else:
        return abstract, doi, title, volume, year, publisher, fullText, pdf,pmcid#image may sometimes show up
Example #34
def individual_test():

    user_query = "microsoft forms"
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File("./articleTitleIndex_withTitle/")))
    searcher = IndexSearcher(reader)
    res =  predict(user_query, analyzer, reader, searcher)
    print "goal_categories:"
    print res
    converted_res = []
    for label in res:
        #print label[0]
        converted_res.append(cvt.WikiToKDD[label[0].replace('_', ' ')])
    if not res:
        print "empty goal category set"
    print "converted goal_categories:"
    print converted_res
Example #35
    def xmlrpc_unindexDocument(self, instance, id):
        """ Unindex document """
        filter = BooleanFilter()

        filter.add(FilterClause(RangeFilter('id', id, id, 1, 1),
                                BooleanClause.Occur.MUST))
        filter.add(FilterClause(RangeFilter('instance', instance, instance, 1, 1),
                                BooleanClause.Occur.MUST))
        
        reader = IndexReader.open(self.indexPath)

        bits = filter.bits(reader)

        docId = bits.nextSetBit(0)
        while docId >= 0:
            reader.deleteDocument(docId)
            docId = bits.nextSetBit(docId+1)

        reader.close()
Example #36
def searchLucene(requestParameter):
    "this method is used to search Lucene"
    searchResults = []
    requestParameter = requestParameter.replace("/"," ")
    # 1. open the index
    if __name__ == "luceneSearch":
        lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    index = SimpleFSDirectory(File("Home/WishMatcherIndex"))
    reader = IndexReader.open(index)
    n_docs = reader.numDocs()
    print("Index contains %d documents." % n_docs)

    # 2. parse the query from the command line
    fields=["AdLine","FieldString","FieldRelatedWords"]    
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser,requestParameter)
    print(query)

    # 3. search the index for the query
    # We retrieve and sort all documents that match the query.
    # In a real application, use a TopScoreDocCollector to sort the hits.
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n_docs).scoreDocs

    # 4. display results
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        product = doc.get("AdLine")
        url = doc.get("URL")
        if(doc.get("AdId") != 1200):
            product = product[:-1]
            url = url[:-1]
        print("%d. %s" % (i + 1, doc.get("AdLine")))
        r = result(str(product),str(url))
        searchResults.append(r)

    # 5. close resources
    #searcher.close()
    print(searchResults)
    return searchResults
Example #37
def get_wiki_nums(data_file, wikipedia_index) :
	lucene.initVM()
	reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
	searcher = IndexSearcher(reader)
	id_file = open(data_file + '.docid')
	num_file = open(data_file + '.nums', 'w')
	what = []
	for line in id_file :
		line = line.strip()
		if len(line) == 0 :
			continue
		line = line.split('\t')
		if len(line) == 2 and int(line[1]) not in [-1, 0, 1, 2, 3]:
			what.append(int(line[1]))

	what = list(set(what))

	for item in what :
		num_file.write(str(item) + '\t' + searcher.doc(item).get("num").encode('utf-8') + '\n')
Example #38
    def load_index(self):
        indexDir = File(self.index_path)
        porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "code": JavaCodeAnalyzer()
        }

        self.analyzer = PerFieldAnalyzerWrapper(porter_analyzer, a)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)  # note: this IndexReader was previously never closed
        n_docs = self.reader.numDocs()
        print("Index contains %d documents." % n_docs)
Example #39
    def load_index(self):
        indexDir = File(self.index_path)
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer()
        }
        self.analyzer = PerFieldAnalyzerWrapper(self.porter_analyzer, a)

        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("\nLoading Indices... GitHub index contains [%d] documents." %
              n_docs)
Example #40
    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                                   query, ['docno', 'content'],
                                                   [SHOULD, SHOULD],
                                                   self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        j = 0
        for i in topDocs.scoreDocs:
            d = searcher.doc(i.doc)

            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)

            j += 1
Example #41
def getSimilarityGenerator(field,minTermFreq,minDocFreq,minWordLen): # maxQueryTerms as parameter
    maxQueryTerms = 30
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    
    simil = MoreLikeThis(reader)
    simil.setFieldNames(field)
    simil.setMinTermFreq(minTermFreq)
    simil.setMinDocFreq(minDocFreq)
    simil.setMinWordLen(minWordLen)

    #Use same maxQueryTerms to prevent longer queries
    simil.setMaxQueryTerms(maxQueryTerms)

    simil.setBoost(True) #!Boost terms within queries by tf-idf score
    return simil
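One way the configured MoreLikeThis object might be used: build a like-this query from an existing document and run it against the same index. The document id, field name and hit count below are assumptions:

# Hypothetical usage of getSimilarityGenerator(); doc id and counts are assumptions.
mlt = getSimilarityGenerator("abstract", minTermFreq=2, minDocFreq=5, minWordLen=4)
like_query = mlt.like(123)                    # query built from the terms of doc 123
location = web.__path__[0] + "/static/web/files/index/index.articles"
reader = IndexReader.open(SimpleFSDirectory(File(location)))
searcher = IndexSearcher(reader)
for hit in searcher.search(like_query, 10).scoreDocs:
    print(hit.doc, hit.score)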
    
    
Example #42
File: retriever.py  Project: kevkid/YIF
def getRandomDoc():
    
        location = web.__path__[0] + "/static/web/files/index/index.figures"
        #lucene.initVM()
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        reader = IndexReader.open(SimpleFSDirectory(File(location)))
        searcher = IndexSearcher(reader)
     
        #query = QueryParser(Version.LUCENE_4_10_1, "keywords", analyzer).parse(queryString)#"Shigella sonnei"
        MAX = 1000
        docNum = randrange(0, reader.maxDoc())
        doc = reader.document(docNum)
        
        fileName = doc.get("filename")
        filePath = doc.get("filepath")
            
        result = filePath + "/" + fileName 
        result = result.replace("/home/kevin/Downloads/","/")
        return (result, docNum)
Example #43
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different
    analyzer configurations.
    """
    start = time.clock()
    Indexer(data_dir, store_dir, analyzer)
    end = time.clock()
    duration = end-start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()

    # sometimes .size() doesn't return the correct size, in this case
    # we have to count manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    reader.close()
    return duration, vocab_size
Example #44
File: retriever.py  Project: kevkid/YIF
def getDocumentClass(reqClass):
    import random
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = QueryParser(Version.LUCENE_4_10_1, "class", analyzer).parse(str(random.choice(reqClass)))#get random class and search for a document from here
    MAX = 1000
    hits = searcher.search(query, MAX)
    docs = []
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        docs.append((searcher.doc(hit.doc), hit.doc))
    
    if not docs:
        return -1
    else:
        doc = random.choice(docs)
        return doc#return document and ID
Example #45
File: common.py  Project: zoubiao1/pylucene
 def __init__(self, indexDir,
              computeLengthNorm=True):  # init: indexDir - index directory; computeLengthNorm - whether to apply the custom similarity (True - not applied, False - applied)
     #         if not jpype.isJVMStarted():
     #         lucene.initVM()
     lucene.getVMEnv().attachCurrentThread()
     self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # standard analyzer; for English text it tokenizes on delimiters
     self.path = os.path.join(INDEX_PATH, indexDir)  # index storage path
     self.store = SimpleFSDirectory(File(self.path))  # filesystem-backed store
     # self.reader = DirectoryReader.open(self.store)
     self.reader = IndexReader.open(self.store)
     self.numDocs = self.reader.maxDoc()
     self.searcher = IndexSearcher(self.reader)  # IndexSearcher instance
     sim = CustomSimilarity()  # addby zmq
     if not computeLengthNorm:  # SIM
         sim = CustomSimilarity()
         self.searcher.setSimilarity(sim)
     self.mlt = MoreLikeThis(self.reader, sim)  # MoreLikeThis helper
     self.mlt.setAnalyzer(self.analyzer)
     self.mlt.setMinTermFreq(1)
     self.mlt.setMinDocFreq(1)
     # debug
     self.mlt.setMinWordLen(1)
     self.mlt.setMaxNumTokensParsed(100000000)
     BooleanQuery.setMaxClauseCount(1024 * 1024)  # raise the maximum query clause count (works around the "too many clauses" limit)
Example #46
 
import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import IndexReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
 
if __name__ == "__main__":
    lucene.initVM()
    print "lucene version is:", Version
    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Get index storage
    indexDir = SimpleFSDirectory(File("index/"))
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
 
    query = QueryParser(Version.LUCENE_CURRENT, "country", analyzer).parse("India")
    MAX = 1000
    hits = searcher.search(query, MAX)
 
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("country").encode("utf-8")
Example #47
 def __init__(self, lucene_index_dir='/data/zjy/csqa_data/lucene_dir/'):
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     directory = SimpleFSDirectory(File(lucene_index_dir))
     self.searcher = IndexSearcher(DirectoryReader.open(directory))
     self.num_docs_to_return = 5
     self.ireader = IndexReader.open(directory)
Example #48
    docs = coll.find(BasicDBObject({"question_id": question_id}))

    apis = []
    for doc in docs:
        answer = doc.toMap()
        apis.append(answer["typed_method_call"])

    print apis


indexDir = File("/tmp/stackoverflow")

# 1. open the index
analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. parse the query from the command line
a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer()}
wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)

query_string = "lucene get similar documents to the current one"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["title"],
                                     wrapper_analyzer)

#base_query = getSpanNearQuery(analyzer, query_string)

base_query = query_parser.parse(query_string)
Example #49
import sys
import lucene

from java.io import File
from org.apache.lucene.analysis.es import SpanishAnalyzer
from org.apache.lucene.index import IndexReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

if __name__ == "__main__":

    if len(sys.argv) != 3:

        print sys.argv[0] + ' <stopWords> <searchString>'
        exit()

    indexDirectory = sys.argv[1]
    searchString = sys.argv[2]

    lucene.initVM()
    analyzer = SpanishAnalyzer()
    reader = IndexReader.open(SimpleFSDirectory(File(indexDirectory)))
    searcher = IndexSearcher(reader)

    query = QueryParser("text", analyzer).parse(searchString)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        # print doc.get("text").encode("utf-8")
        print doc.get("filename")
Example #50
 def open(self):
     debug('Opening index "%s"' % self.indexPath)
     self.reader = IndexReader.open(self.indexDir)
     self.searcher = IndexSearcher(self.reader)
     self.totalDocs = self.getTotalSentenceCount()
Example #51
 def __init__(self, index_path):
     indexDir = File(index_path)
     index = SimpleFSDirectory(indexDir)
     self.reader = IndexReader.open(index)
     n_docs = self.reader.numDocs()
     self.searcher = IndexSearcher(self.reader)
Example #52
def lucene_retrieval_multifield(q_string,
                                q_class,
                                feature_type,
                                use_BM25=False):
    """
    multifield: a different query string is used for each field,
    rather than the same words on every field
    :param q_string:
    :param q_class:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text',
                             analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name',
                                analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    #  SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #53
def search(collection_name, tofind):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
    except:
        return 105

    #initializing return list
    return_list = []
    #check_list=[]
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(
                tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)
            #non primary key filtering(without having to load all the primary key filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)

    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            #non primary key filtering(without having to load all the primary key filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)

    ireader.close()

    if len(return_list) == 0:
        return None
    else:
        return return_list
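search() relies on globals defined elsewhere in its module: INDEX_DIR_DEFAULT, MAX_RESULTS, primary_keys_map and to_be_compressed_map. A sketch of what they might look like, plus one illustrative call (all values are assumptions):

# Hypothetical globals assumed by search(); values are illustrative only.
INDEX_DIR_DEFAULT = "IndexFiles.index"
MAX_RESULTS = 1000
primary_keys_map = {"products": ["id"]}        # primary keys per collection
to_be_compressed_map = {"products": False}     # whether $DATA$ is snappy-compressed

matches = search("products", json.dumps({"id": "42", "colour": "red"}))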
Example #54
lucene.initVM()

# ANALYZER
analyzer = StandardAnalyzer(util.Version.LUCENE_CURRENT)

# DIRECTORY
directory = SimpleFSDirectory(File(luceneIndexPath))

#dont forget to remove the luceneIndexDirectory file everytime you run this code.
'''
# INDEX WRITER
code removed
'''
# INDEX READER
reader = IndexReader.open(directory)
searcher = IndexSearcher(reader)

# QUERYING FOR A QUESTION
queryParser = QueryParser(util.Version.LUCENE_CURRENT, "text", analyzer)

###################### end
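The snippet builds queryParser but stops before running a search; a minimal continuation sketch, where the query text, hit count and use of the "text" field are assumptions:

# Hypothetical continuation: parse one question and print the best matches.
q = queryParser.parse(QueryParser.escape("how to reverse a linked list"))
for hit in searcher.search(q, 10).scoreDocs:
    doc = searcher.doc(hit.doc)
    print(hit.score, doc.get("text"))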

###################### creating vectors (infering) for testing

#inference hyper-parameters
tic = time.time()
start_alpha = 0.01
infer_epoch = 1000
testingFilePath = '/home/tarun/PE/testFiles/testNum18000.csv'
Example #55
#!/usr/bin/python
import sys, os

sys.path.append("../lib/lucene-core-3.6.2.jar")
sys.path.append("../lib/lucene-core-3.6.2-javadoc.jar")

from java.io import File
from java.util import Scanner
from org.apache.lucene.index import IndexReader, Term
from org.apache.lucene.store import SimpleFSDirectory
import pdb

if __name__ == "__main__":

    r = IndexReader.open(SimpleFSDirectory(File('../index')))
    print "... total number of documents in the index is " + str(r.maxDoc())
    t = r.terms()
    i = 0
    count_add = 0
    while t.next():
        i = i + 1
        if i > 100010:
            break
        if i > 100000:
            print "[" + str(i) + "]" + t.term().text()

    te = Term("contents", "brute")
    print "... number of documents with the word brute is : " + str(
        r.docFreq(te))
    td = r.termDocs(te)
Example #56
def predict_test(indexed_data, index_destination, source='directory', already_indexed=False):
    """
    :param indexed_data:
    :param index_destination:
    :param source:
    :param already_indexed:
    :return:
    """
    def choose_best():
        scores = []
        for k, v in sorted(res.items(), key=lambda x: x[0]):
            scores.append((k, 1. * sum(data_test['correctAnswer'] == v) / len(v)))
        return sorted(scores, key=lambda x: -x[-1])[0][0]
 
    def calculate_score(res):
        """
        :param res:
        :return:
        """
        correct = 0
        total = 0
        for index, row in data_test.iterrows():
            if res[index] == row['correctAnswer']:
                correct += 1
            total += 1
        return float(correct)/total
 
    if not already_indexed:
        make_index(indexed_data, index_destination, source)
 
    res = {}
    MAX = 100
    docs_per_q = range(1,20)

    records = []
 
    #analyzer = StandardAnalyzer(Version.LUCENE_30)
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
    reader = IndexReader.open(SimpleFSDirectory(File(index_destination)))
    searcher = IndexSearcher(reader)
 
    for index, row in data_test.iterrows():
 
        queries = [row['answerA'], row['answerB'], row['answerC'], row['answerD']]
        queries = [row['question'] + ' ' + q for q in queries]
 
        scores = {}
        for q in queries:
            query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", q))
            #query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[/^]", "\^", q))
            hits = searcher.search(query, MAX)
            doc_importance = [hit.score for hit in hits.scoreDocs]
            for n in docs_per_q:
                scores.setdefault(n, [])
                scores[n].append(sum(doc_importance[:n]))

        to_records = [index+102501]
        to_records.append(['A','B','C','D'][np.argmax(scores[4])])
        records.append(to_records)

        for n in docs_per_q:
            res.setdefault(n, [])
            res[n].append(['A','B','C','D'][np.argmax(scores[n])])

    df = pandas.DataFrame.from_records(records, columns=["id","correctAnswer"])
    df = df.set_index("id")
    df.to_csv("ololo.csv")

    # print res[4]
    best = choose_best()
    print best
    score = calculate_score(res[best])
    # score = calculate_score(res)
    print score
Example #57
def create_searcher():
    init_lucene()
    reader = IndexReader.open(SimpleFSDirectory(File(INDEX_DIR)))
    return IndexSearcher(reader)
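create_searcher depends on an init_lucene() helper and an INDEX_DIR constant that are not part of the snippet; a plausible sketch of both, purely as an assumption:

# Hypothetical definitions assumed by create_searcher(); path is illustrative.
INDEX_DIR = "/data/lucene_index"

def init_lucene():
    # start the JVM once, or attach the current thread if it is already running
    env = lucene.getVMEnv()
    if env is None:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    else:
        env.attachCurrentThread()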