Example #1
def irsolver(data_file, index):
    from questions import get_input_data
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    pred = []
    mapp = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}

    idx, ques, ans = get_input_data(data_file)
    for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)):
        max_score = -1000000
        best_ans = 'A'
        for i, ai in enumerate(a):
            sc = query(q, ai, analyzer, searcher)
            print(acm, i, sc)
            if sc > max_score:
                max_score = sc
                best_ans = mapp[i + 1]
        pred.append(best_ans)

    return idx, pred
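Most of the snippets below repeat the same open-parse-search pattern. For reference, a minimal self-contained sketch of that pattern, assuming PyLucene 4.10.x and a hypothetical index at ./index with a "text" field:

import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

lucene.initVM()  # start the JVM; must happen once per process
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
reader = IndexReader.open(SimpleFSDirectory(File("./index")))
searcher = IndexSearcher(reader)

query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse("example query")
for hit in searcher.search(query, 10).scoreDocs:  # top 10 hits
    doc = searcher.doc(hit.doc)
    print(hit.score, doc.get("text"))
reader.close()  # unlike many snippets here, release the reader when done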
Example #2
File: retriever.py Project: kevkid/YIF
def getRandomDoc2():
    
        location = web.__path__[0] + "/static/web/files/index/index.articles"
        #lucene.initVM()
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        reader = IndexReader.open(SimpleFSDirectory(File(location)))
        searcher = IndexSearcher(reader)
     
        #query = QueryParser(Version.LUCENE_4_10_1, "keywords", analyzer).parse(queryString)#"Shigella sonnei"
        MAX = 1000
        docNum = randrange(0, reader.maxDoc())
        doc = reader.document(docNum)
     
        #print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        files = []
        fileRoots = []
        paths = []
        paths.append(doc.get("articlepath"))
        pth = paths[0].replace("/home/kevin/Downloads/","/home/kevin/git/YIF/imageFinder/web/static/web/")#os.path.join(tools.__path__,"static/web/images")
        for root, directories, filenames in os.walk(pth):#probably something wrong with the location
            for filename in filenames:
                if (".jpg" or ".gif" or ".png") in filename:
                    files.append(root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/","") + "/" +filename)#temp, will need to chance            
                    fileRoots.append(root)
                    print (root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/","") + "/" + filename)
        try:
            rng = randrange(0, len(files))
        except ValueError:  # files is empty
            return -1
        else:
            return files[rng]
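Note the JVM handling here: because this code runs inside a web server, the snippet assumes lucene.initVM() was already called at startup and only attaches the current request thread. A hedged sketch of that idiom (the helper name is made up for illustration):

import lucene

def ensure_attached():
    # initVM() may only be called once per process; afterwards
    # getVMEnv() returns the running VM (None if it was never started)
    env = lucene.getVMEnv()
    if env is None:
        env = lucene.initVM()
    env.attachCurrentThread()  # safe to call repeatedly from a thread
    return env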
Example #3
def get_candidates(qatp):

    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1

        q = q.replace('AND', '\\AND').replace('OR',
                                              '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text",
                            analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)

    return candidates
Example #4
def shourcut_retriever(keyword):
    '''Retriever: search within the shortcut (summary) field'''
    global flag
    if flag:
        lucene.initVM()
    flag = False
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_4_10_1, "shortcut",
                        analyzer).parse(keyword)
    MAX = 20
    hits = searcher.search(query, MAX)

    print("Found %d document(s) that matched query '%s':" %
          (hits.totalHits, query))
    results = []
    for hit in hits.scoreDocs:
        print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        result = [doc.get('shortcut'), doc.get('url'), doc.get('name')]
        print(doc.get('url'))
        results.append(result)
    return results
Example #5
    def load_index(self):
        indexDir = File(self.index_path)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("Index contains %d documents." % n_docs)
Example #8
    def __init__(self, index_path):
        # This part loads the indices through Lucene and initializes the reader and searcher
        indexDir = File(index_path)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(
            index)  # the IndexReader used to be opened without ever being closed...
        self.searcher = IndexSearcher(self.reader)
Example #9
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)

    fields = ["title", "abstract", "authors"]

    st = PorterStemmer()
    for id, q in queries.iteritems():
        query = q
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(query)
        qwords_k = [st.stem(q) for q in qwords]
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc+1, i+1, hit.score))
            # print hit.doc+1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")
    f.close()
Example #10
File: retriever.py Project: kevkid/YIF
def get_image_pmcid(pmcid, classes = ""):
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    
    #query = query.parse(query, ('4175339','1'))
    # query.parse(queryString)#"Shigella sonnei"
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    #hits = searcher.search(query, MAX)
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) +")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query
    q = query.parse(queryStr) 
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))
    return docs #This will return the image documents that belong to a pmcid(article)
Example #11
def evaluate_index(index_dir, context, analyzer):
    # evaluate the overall indexing time;
    # we should also measure the elapsed time of
    # each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end-start

    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)


    # print str(vocab_size) # size of vocabulary
    # print str(vocabulary.getDocCount()) # #docs that have at least one term for title field
    # print str(vocabulary.getSumTotalTermFreq()) # #tokens
    # print str(vocabulary.getSumDocFreq()) # #postings

    reader.close()
    return duration, vocab_size
Example #12
    def __init__(self,
                 lucene_index_dir='lucene_index/',
                 num_docs_to_return=1000):
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        directory = SimpleFSDirectory(File(lucene_index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(directory))
        self.num_docs_to_return = num_docs_to_return
        self.ireader = IndexReader.open(directory)
Example #13
    def __init__(self, path):
        print "Loading data.json..."
        with open(path, "r") as f:
            self.data = json.load(f)
        lucene.initVM()
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        self.reader = IndexReader.open(SimpleFSDirectory(File("index/")))
        self.searcher = IndexSearcher(self.reader)
Example #14
    def __init__(self, lucene_dir_path):
        if lucene_dir_path:  # neither None nor ''
            lucene.initVM()
            directory = SimpleFSDirectory(File(lucene_dir_path))
            self.indexReader = IndexReader.open(directory)
            self.is_init = True
        else:
            self.is_init = False
Example #15
File: retriever.py Project: kevkid/YIF
def SearchQuery(queryString, fields, classification): 
    #if __name__ == "__main__":
    #if __name__ == "retriever":
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    #query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(query, queryString)
    #query.parse(queryString)#"Shigella sonnei"
    #query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    hits = searcher.search(query, MAX)
 
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title" : doc.get("title")}#we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict 
    
    #Where we get the images for all the pmcids    
    images = get_image_pmcid(pmcids, classification)#should take in pmcids and class
    #create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid") 
        if img_pmcid in imagesDict.keys():
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
            
        else:
            imagesDict[img_pmcid] = [(img.get("filepath") + "/" + img.get("figureid"))]
            
    #for each pmcid, we will assign an image to it for the search results
    for pmcid in pmcids:
        if pmcid in imagesDict:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = imagesDict[pmcid][0] 
            documentDict[pmcid] = docDict 
        else:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
            documentDict[pmcid] = docDict
    
    #END - Where we get the images for all the pmcids
    
    
    return documentDict
Example #16
    def __init__(
        self,
        lucene_index_dir='/dccstor/cssblr/amrita/dialog_qa/code/prepro_lucene/lucene_index/'
    ):
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        directory = SimpleFSDirectory(File(lucene_index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(directory))
        self.num_docs_to_return = 5
        self.ireader = IndexReader.open(directory)
Example #17
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: different query string for different field
    not same word on different field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    #  SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
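feature_type in this family of functions is a list of aggregation functions, each applied to the full list of hit scores. A hypothetical call, assuming the module-level set_lucene_index, version, analyzer and hitsPerPage these snippets rely on are already configured:

# illustrative only: aggregate hit scores as (max, sum, count)
features = [max, sum, len]
scores = list(lucene_retrieval_multifield("who invented the telephone",
                                          "history",  # corpus_name to boost
                                          features,
                                          use_BM25=True))
# scores -> [highest_score, sum_of_scores, number_of_hits]

Note that, unlike the variant in Example #20 below, this version does not guard against an empty hit list, so max or sum would raise if nothing matches.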
Example #18
def search_results(search):
    results = []
    search_string = search.data['search']

    print "buscando:" + search_string

    lucene.initVM()
    analyzer = SpanishAnalyzer()
    reader = IndexReader.open(SimpleFSDirectory(File(indexDirectory)))
    searcher = IndexSearcher(reader)

    query = QueryParser("text", analyzer).parse(search_string)
    MAX = 1000
    hits = searcher.search(query, MAX)

    if not hits.totalHits:
        flash('No results found!')
        return redirect('/')
    else:

        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        flash("Found %d document(s) that matched query '%s':" %
              (hits.totalHits, query))
        # render_template('index.html', form=search)
        items = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            if len(items) >= 10:
                flash('Returning only first 10 results')
                break

            doc = searcher.doc(hit.doc)
            items.append(
                Row(
                    hit.score,
                    doc.get("tipo_sesion"),
                    doc.get("organo"),
                    doc.get("presidente"),
                    doc.get("dia") + "/" + doc.get("mes") + "/" +
                    doc.get("anio"),
                    doc.get("tipo_epigrafe"),
                    doc.get("text").replace(
                        search_string, '<span class="highlightme">' +
                        search_string + '</span>'),
                    # doc.get("filename") # + "<a href='#'>hola"
                    '<a href="' + doc.get("filename") + '">' +
                    doc.get("filename") + '</a>'))
        #     print hit.score, hit.doc, hit.toString()
        #     doc = searcher.doc(hit.doc)
        #     # print doc.get("text").encode("utf-8")
        #     print

        # display results
        return render_template('index.html',
                               form=search,
                               items=items,
                               search_string=search_string)
Example #19
    def load_index(self):
        indexDir = File(self.index_path)
        a = {"code": self.porter_analyzer}
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("Index contains %d documents." % n_docs)
Example #20
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type) if len(doc_score_list) != 0 else [0] * len(
                       feature_type)  # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text',
                        analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #21
    def xmlrpc_getStatistics(self, instance):
        reader = IndexReader.open(self.indexPath)

        filter = RangeFilter('instance', instance, instance, 1, 1)

        num = filter.bits(reader).cardinality()

        stat = Vector()
        stat.add(num)
        stat.add(0)#len(index.terms()))
        reader.close()
        return stat
Example #22
def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data
    data = get_input_data(data_file)

    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)

    generate_docids(data, data_file, analyzer, searcher)
Example #23
    def __init__(self):
        #self.segmentor.load('./cws.model')
        INDEXDIR = './Myindex'
        #lucene.initVM(vmargs='-Xcheck:jni,-verbose:jni,-verbose:gc')
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        #vm_env = lucene.getVMEnv()
        #vm_env.attachCurrentThread()
        #lucene.initVM(vmargs='-')
        #print 'lucene', lucene.VERSION
        self.directory = SimpleFSDirectory(File(INDEXDIR))
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
        self.analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        self.reader = IndexReader.open(self.directory)
Example #24
def group_tests():

    TP = 0.0
    FN = 0.0
    n = 0.0
    precision = 0
    recall = 0

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File("./articleTitleIndex/")))
    searcher = IndexSearcher(reader)
    with open('Labeled800Queries/labeler3.txt', 'r') as f:
        for line in f:
            n += 1
            line = line.split('\t')
            user_query = line[0]
            labels = line[1:]
            user_query = re.sub('[^0-9a-zA-Z]+', ' ', user_query)
            print user_query
            print labels
            res =  predict(user_query, analyzer, reader, searcher, test = "group")

            converted_res = []
            for label in res:
                #print label[0]
                converted_res.append(cvt.WikiToKDD[label[0].replace('_', ' ')])

            if not res:
                print "empty goal category set"
            print converted_res

            """ compare labels and converted_res """
            for label in labels:
                label = label.replace('\r', '')
                label = label.replace('\n', '')
                if label not in cvt.WikiToKDD.values():
                    continue
                #print label
                if label in converted_res:
                    TP += 1.0
                else:
                    FN += 1.0
            
            print "=========================================================="

    precision = TP/(SIZE*n)
    recall = TP/(TP+FN)

    print "precision:", precision
    print "recall:", recall
Example #27
    def find(self, query):
        transformer = StringTransformer()
        analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
        reader = IndexReader.open(SimpleFSDirectory(File("index/")))
        searcher = IndexSearcher(reader)
        searcher.setSimilarity(BM25Similarity())
        processed_query = ' '.join(
            self._preprocessor(transformer.transform(query)))
        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            analyzer).parse(processed_query)
        hits = searcher.search(query, 10)
        result_list = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            result_list.append(doc.get("path").encode("utf-8"))
        return result_list
Example #28
def make_request(query, analyzer, index, qparser_regexp=None, max_results=100):
    """ Returns list of Score objects, which exposes `doc` (document ID in lucene) and `score` attrs
    :param query:
    :param analyzer:
    :param index:
    :param qparser_regexp:
    :param max_results:
    :return:
    """
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(
        query if not qparser_regexp else re.sub(qparser_regexp, " ", query))
    hits = searcher.search(query, max_results)
    return hits.scoreDocs
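A hypothetical invocation of the helper above, assuming a PyLucene 3.x setup (matching Version.LUCENE_30) and a placeholder ./index directory:

lucene.initVM()
analyzer = StandardAnalyzer(Version.LUCENE_30)
# qparser_regexp strips anything the query parser might choke on (illustrative pattern)
for hit in make_request("what is an atom", analyzer, "./index",
                        qparser_regexp=r"[^a-zA-Z0-9 ]", max_results=10):
    print(hit.doc, hit.score)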
Example #31
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    #setting writer configurations
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    ### As of now, deletion of documents is supported only via indexed keys. ###
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key,
                           analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    a = writer.deleteDocuments(query)
    if commit:
        writer.commit()
    writer.close()
    return 000
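A hypothetical call, deleting one record by primary key from the default collection (the field name "id" is made up):

import json

status = delete(primary_keys_map=["id"],
                collection_name="DEFAULT",
                todelete=json.dumps({"id": "42"}),
                commit=True)
# 0 on success, 100 if todelete is not valid JSON, 105 if the index could not be opened
print(status)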
Example #32
def lucene_retrieval(q_string, use_BM25=False):
    """

    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return doc_name & score
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'

    # escape special characters via escape function
    if q_string and q_string.strip():   # when answers are pre-processed, `none of the above` becomes '' and would cause an error here
        #print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)

        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)

        # reader.close()
    return result  # text: also nodes
Example #35
def number(collection_name):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
    except:
        return 105
    numdocs = int(ireader.numDocs())

    ireader.close()

    return numdocs
Example #37
    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query, ['docno', 'content'], [SHOULD, SHOULD], self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        j = 0
        for i in topDocs.scoreDocs:
            d = searcher.doc(i.doc)

            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)

            j += 1
Example #38
File: retriever.py Project: kevkid/YIF
def getDocumentPMC_ID(pmcid, imageAndTitle = 0):
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer).parse(pmcid)#"Shigella sonnei"
    MAX = 1000
    hits = searcher.search(query, MAX)
    title = ""
    abstract = ""
    fullText = "http://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
    doi = ""#need to split
    
    volume = ""
    year = ""
    publisher = ""
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        if(imageAndTitle == 1):
            paths = []
            paths.append(doc.get("articlepath"))
            image = get_image(paths)
            
        abstract = doc.get("abstract")
        doi = doc.get("doi")
        title = doc.get("title")
        volume = doc.get("volume")
        year = doc.get("year")
        publisher = doc.get("publisher")
    if doi is not None:
        doiSecond = doi.split('/')
        doiSecond = doiSecond[1]#second part
    else:
        doiSecond = ""
    #http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3363814/pdf/cc11003.pdf
    pdf = "http://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/pdf/" + doiSecond + ".pdf" 
    if(imageAndTitle == 1):
        return title, image, pmcid#image may sometimes show up    
    else:
        return abstract, doi, title, volume, year, publisher, fullText, pdf,pmcid#image may sometimes show up
Example #39
def individual_test():

    user_query = "microsoft forms"
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File("./articleTitleIndex_withTitle/")))
    searcher = IndexSearcher(reader)
    res =  predict(user_query, analyzer, reader, searcher)
    print "goal_categories:"
    print res
    converted_res = []
    for label in res:
        #print label[0]
        converted_res.append(cvt.WikiToKDD[label[0].replace('_', ' ')])
    if not res:
        print "empty goal category set"
    print "converted goal_categories:"
    print converted_res
Example #40
    def xmlrpc_unindexDocument(self, instance, id):
        """ Unindex document """
        filter = BooleanFilter()

        filter.add(FilterClause(RangeFilter('id', id, id, 1, 1),
                                BooleanClause.Occur.MUST))
        filter.add(FilterClause(RangeFilter('instance', instance, instance, 1, 1),
                                BooleanClause.Occur.MUST))
        
        reader = IndexReader.open(self.indexPath)

        bits = filter.bits(reader)

        docId = bits.nextSetBit(0)
        while docId >= 0:
            reader.deleteDocument(docId)
            docId = bits.nextSetBit(docId+1)

        reader.close()
Example #41
def searchLucene(requestParameter):
    "this method is used to search Lucene"
    searchResults = []
    requestParameter = requestParameter.replace("/"," ")
    # 1. open the index
    if __name__ == "luceneSearch":
        lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    index = SimpleFSDirectory(File("Home/WishMatcherIndex"))
    reader = IndexReader.open(index)
    n_docs = reader.numDocs()
    print("Index contains %d documents." % n_docs)

    # 2. parse the query from the command line
    fields=["AdLine","FieldString","FieldRelatedWords"]    
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser,requestParameter)
    print(query)

    # 3. search the index for the query
    # We retrieve and sort all documents that match the query.
    # In a real application, use a TopScoreDocCollector to sort the hits.
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n_docs).scoreDocs

    # 4. display results
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        product = doc.get("AdLine")
        url = doc.get("URL")
        if(doc.get("AdId") != 1200):
            product = product[:-1]
            url = url[:-1]
        print("%d. %s" % (i + 1, doc.get("AdLine")))
        r = result(str(product),str(url))
        searchResults.append(r)

    # 5. close resources
    #searcher.close()
    print(searchResults)
    return searchResults
Example #42
def get_wiki_nums(data_file, wikipedia_index) :
	lucene.initVM()
	reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
	searcher = IndexSearcher(reader)
	id_file = open(data_file + '.docid')
	num_file = open(data_file + '.nums', 'w')
	what = []
	for line in id_file :
		line = line.strip()
		if len(line) == 0 :
			continue
		line = line.split('\t')
		if len(line) == 2 and int(line[1]) not in [-1, 0, 1, 2, 3]:
			what.append(int(line[1]))

	what = list(set(what))

	for item in what :
		num_file.write(str(item) + '\t' + searcher.doc(item).get("num").encode('utf-8') + '\n')
Example #43
    def load_index(self):
        indexDir = File(self.index_path)
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer()
        }
        self.analyzer = PerFieldAnalyzerWrapper(self.porter_analyzer, a)

        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("\nLoading Indices... GitHub index contains [%d] documents." %
              n_docs)
Example #44
    def load_index(self):
        indexDir = File(self.index_path)
        porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "code": JavaCodeAnalyzer()
        }

        self.analyzer = PerFieldAnalyzerWrapper(porter_analyzer, a)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(
            index)  # the IndexReader used to be opened without ever being closed...
        n_docs = self.reader.numDocs()
        print("Index contains %d documents." % n_docs)
Example #46
File: retriever.py Project: kevkid/YIF
def getRandomDoc():
    
        location = web.__path__[0] + "/static/web/files/index/index.figures"
        #lucene.initVM()
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        reader = IndexReader.open(SimpleFSDirectory(File(location)))
        searcher = IndexSearcher(reader)
     
        #query = QueryParser(Version.LUCENE_4_10_1, "keywords", analyzer).parse(queryString)#"Shigella sonnei"
        MAX = 1000
        docNum = randrange(0, reader.maxDoc())
        doc = reader.document(docNum)
        
        fileName = doc.get("filename")
        filePath = doc.get("filepath")
            
        result = filePath + "/" + fileName 
        result = result.replace("/home/kevin/Downloads/","/")
        return (result, docNum)
Example #47
def getSimilarityGenerator(field,minTermFreq,minDocFreq,minWordLen): # maxQueryTerms as parameter
    maxQueryTerms = 30
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    
    simil = MoreLikeThis(reader)
    simil.setFieldNames(field)
    simil.setMinTermFreq(minTermFreq)
    simil.setMinDocFreq(minDocFreq)
    simil.setMinWordLen(minWordLen)

    #Use same maxQueryTerms to prevent longer queries
    simil.setMaxQueryTerms(maxQueryTerms)

    simil.setBoost(True) #!Boost terms within queries by tf-idf score
    return simil
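A sketch of how the returned MoreLikeThis instance might be used, assuming Lucene 4.x, where like(docNum) builds a query from an already-indexed document (the reader/searcher setup is recreated here the same way as in the other kevkid/YIF snippets):

simil = getSimilarityGenerator("abstract", minTermFreq=2, minDocFreq=5, minWordLen=4)
location = web.__path__[0] + "/static/web/files/index/index.articles"
reader = IndexReader.open(SimpleFSDirectory(File(location)))
searcher = IndexSearcher(reader)
mlt_query = simil.like(0)  # build a "more like this" query from document 0
hits = searcher.search(mlt_query, 10)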
    
    
Example #48
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different
    analyzer configurations.
    """
    start = time.clock()
    Indexer(data_dir, store_dir, analyzer)
    end = time.clock()
    duration = end-start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()

    # sometimes .size() doesn't return the correct size, in this case
    # we have to count manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    reader.close()
    return duration, vocab_size
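A hypothetical timing run for the evaluation above, assuming the Indexer class these snippets depend on reads documents from data_dir (both paths are placeholders):

lucene.initVM()
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
duration, vocab_size = evaluate_index("data/", "index/", analyzer)
print("indexing took %.2fs; 'title' vocabulary: %d terms" % (duration, vocab_size))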
Example #49
File: retriever.py Project: kevkid/YIF
def getDocumentClass(reqClass):
    import random
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = QueryParser(Version.LUCENE_4_10_1, "class", analyzer).parse(str(random.choice(reqClass)))#get random class and search for a document from here
    MAX = 1000
    hits = searcher.search(query, MAX)
    docs = []
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        docs.append((searcher.doc(hit.doc), hit.doc))
    
    if not docs:
        return -1
    else:
        doc = random.choice(docs)
        return doc#return document and ID
Example #50
File: common.py Project: zoubiao1/pylucene
    def __init__(self, indexDir,
                 computeLengthNorm=True):  # init: indexDir - index file directory; computeLengthNorm - whether to apply the custom similarity (True - don't, False - do)
        #         if not jpype.isJVMStarted():
        #         lucene.initVM()
        lucene.getVMEnv().attachCurrentThread()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # standard tokenizer; for English it splits on delimiters
        self.path = os.path.join(INDEX_PATH, indexDir)  # storage path
        self.store = SimpleFSDirectory(File(self.path))  # storage directory
        # self.reader = DirectoryReader.open(self.store)
        self.reader = IndexReader.open(self.store)
        self.numDocs = self.reader.maxDoc()
        self.searcher = IndexSearcher(self.reader)  # IndexSearcher instance
        sim = CustomSimilarity()  # added by zmq
        if not computeLengthNorm:  # SIM
            sim = CustomSimilarity()
            self.searcher.setSimilarity(sim)
        self.mlt = MoreLikeThis(self.reader, sim)  # mlt?
        self.mlt.setAnalyzer(self.analyzer)
        self.mlt.setMinTermFreq(1)
        self.mlt.setMinDocFreq(1)
        # debug
        self.mlt.setMinWordLen(1)
        self.mlt.setMaxNumTokensParsed(100000000)
        BooleanQuery.setMaxClauseCount(1024 * 1024)  # raise the max query clause limit (works around the "too many clauses" error on long queries)
Example #51
lucene.initVM()

# ANALYZER
analyzer = StandardAnalyzer(util.Version.LUCENE_CURRENT)

# DIRECTORY
directory = SimpleFSDirectory(File(luceneIndexPath))

# don't forget to remove the luceneIndexDirectory file every time you run this code.
'''
# INDEX WRITER
code removed
'''
# INDEX READER
reader = IndexReader.open(directory)
searcher = IndexSearcher(reader)

# QUERYING FOR A QUESTION
queryParser = QueryParser(util.Version.LUCENE_CURRENT, "text", analyzer)

###################### end

###################### creating vectors (inferring) for testing

#inference hyper-parameters
tic = time.time()
start_alpha = 0.01
infer_epoch = 1000
testingFilePath = '/home/tarun/PE/testFiles/testNum18000.csv'
Example #52
def predict_test(indexed_data, index_destination, source='directory', already_indexed=False):
    """
    :param indexed_data_dir:
    :param index_destination:
    :return:
    """
    def choose_best():
        scores = []
        for k, v in sorted(res.items(), key=lambda x: x[0]):
            scores.append((k, 1. * sum(data_test['correctAnswer'] == v) / len(v)))
        return sorted(scores, key=lambda x: -x[-1])[0][0]
 
    def calculate_score(res):
        """
        :param res:
        :return:
        """
        correct = 0
        total = 0
        for index, row in data_test.iterrows():
            if res[index] == row['correctAnswer']:
                correct += 1
            total += 1
        return float(correct)/total
 
    if not already_indexed:
        make_index(indexed_data, index_destination, source)
 
    res = {}
    MAX = 100
    docs_per_q = range(1,20)

    records = []
 
    #analyzer = StandardAnalyzer(Version.LUCENE_30)
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
    reader = IndexReader.open(SimpleFSDirectory(File(index_destination)))
    searcher = IndexSearcher(reader)
 
    for index, row in data_test.iterrows():
 
        queries = [row['answerA'], row['answerB'], row['answerC'], row['answerD']]
        queries = [row['question'] + ' ' + q for q in queries]
 
        scores = {}
        for q in queries:
            query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", q))
            #query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[/^]", "\^", q))
            hits = searcher.search(query, MAX)
            doc_importance = [hit.score for hit in hits.scoreDocs]
            for n in docs_per_q:
                scores.setdefault(n, [])
                scores[n].append(sum(doc_importance[:n]))

        to_records = [index+102501]
        to_records.append(['A','B','C','D'][np.argmax(scores[4])])
        records.append(to_records)

        for n in docs_per_q:
            res.setdefault(n, [])
            res[n].append(['A','B','C','D'][np.argmax(scores[n])])

    df = pandas.DataFrame.from_records(records, columns=["id","correctAnswer"])
    df = df.set_index("id")
    df.to_csv("ololo.csv")

    # print res[4]
    best = choose_best()
    print best
    score = calculate_score(res[best])
    # score = calculate_score(res)
    print score
Example #53
    def __init__(self, lucene_index_dir='/data/zjy/csqa_data/lucene_dir/'):
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        directory = SimpleFSDirectory(File(lucene_index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(directory))
        self.num_docs_to_return = 5
        self.ireader = IndexReader.open(directory)
Example #55
    def open(self):
        debug('Opening index "%s"' % self.indexPath)
        self.reader = IndexReader.open(self.indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.totalDocs = self.getTotalSentenceCount()
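For symmetry with open(), a matching close() sketch under the same assumptions (debug() and the attribute names mirror the snippet above; in Lucene 4.x only the reader needs explicit closing):

    def close(self):
        debug('Closing index "%s"' % self.indexPath)
        self.reader.close()
        self.reader = None
        self.searcher = None  # IndexSearcher holds no resources of its own in 4.x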