for fileName in os.listdir(corpus):
	#print fileName
	document = Document()
	article = os.path.join(corpus, fileName)
	with open(article, 'r') as f:
		content = f.read()
	document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
	writer.addDocument(document)
print writer.numDocs()
writer.close()
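# Hedged sketch of the setup this loop assumes (not shown in the excerpt).
# The Field.Index.ANALYZED / IndexReader.open calls imply a PyLucene 3.x-era
# flat API; all names below are assumptions:
#
#   import os, lucene
#   from lucene import (initVM, File, SimpleFSDirectory, StandardAnalyzer,
#                       IndexWriter, Document, Field, IndexReader,
#                       IndexSearcher, QueryParser, Version)
#   initVM()
#   corpus = 'corpus'  # directory of plain-text articles
#   directory = SimpleFSDirectory(File('index'))
#   analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
#   writer = IndexWriter(directory, analyzer, True,
#                        IndexWriter.MaxFieldLength.UNLIMITED)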

# INDEX READER
reader = IndexReader.open(directory)
searcher = IndexSearcher(reader)

# QUERYING FOR A QUESTION
queryParser = QueryParser(util.Version.LUCENE_CURRENT, "text", analyzer)
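# Hedged usage sketch (not in the original): escape the raw question so that
# characters like '?' don't trip the parser, then print the top hits.
exampleQuestion = "what is the capital of france"  # hypothetical input
exampleQuery = queryParser.parse(QueryParser.escape(exampleQuestion))
for hit in searcher.search(exampleQuery, 10).scoreDocs:
	print searcher.doc(hit.doc).get("text")[:80]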

'''
answers = ['A', 'B', 'C', 'D']
submissionFile = open("luceneModel.csv", "w")
writer = csv.writer(submissionFile, delimiter=',')
writer.writerow(['id', 'correctAnswer'])


# 10 - 0.3844
# 9 - 0.386
# 5 - 0.3742

with open(trainingFilePath) as trainData:
	reader = csv.reader(trainData, delimiter="\t")
	header=0
Example #2
    return ''


if not (TAGS_AND_GENRES or DESCR):
    raise Exception(
        'At least one of TAGS_AND_GENRES and DESCR must be True')

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
fsDir = SimpleFSDirectory(Paths.get('index'))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

if CLASSIC_SIMILARITY:
    searcher.setSimilarity(ClassicSimilarity())

analyzer = EnglishAnalyzer()
tags_parser = QueryParser(TAGS_LABEL, analyzer)
genres_parser = QueryParser(GENRES_LABEL, analyzer)
descr_parser = QueryParser(DESCR_LABEL, analyzer)

tags_parser.setDefaultOperator(QueryParser.Operator.OR)
genres_parser.setDefaultOperator(QueryParser.Operator.OR)
descr_parser.setDefaultOperator(QueryParser.Operator.OR)

# Raise the clause limit so very long queries don't hit the default 1024 cap.
BooleanQuery.setMaxClauseCount(2000000)

############################## Build user queries ##########################
ratings = ML1M('../datasets/ml-1m').ratings

movies_descriptions = pd.read_csv('../datasets/movies-descriptions.csv')
movies_tags = pd.read_csv('../datasets/movies-tags.csv')
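
# Hedged sketch of the per-user query construction this section sets up
# (column names 'user', 'item', 'rating', 'tag' are assumptions about the
# ML1M / movies-tags schemas):
def build_user_tags_query(user_id):
    liked = ratings[(ratings.user == user_id) & (ratings.rating >= 4)].item
    tags = movies_tags[movies_tags.item.isin(liked)].tag.astype(str)
    return tags_parser.parse(QueryParser.escape(' '.join(tags)))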
Example #3
    print("lm docs not in qrels: %s" % (len(lm_docs)))
    f = codecs.open('/home/fernando/MatchZoo/data/robust04/corpus_n_stem2.txt',
                    'w',
                    encoding='utf8')
    for did in lm_docs:
        f.write("%s %s\n" % (did, lm_docs[did]))
    f.close()


if __name__ == "__main__":
    lucene.initVM()
    index = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(INDEX_BASE_DIR + INDEX_DIR)))
    searcher = IndexSearcher(index)
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)

    qid_doc_list = {}
    qrel_dict = {}
    qrel_docs = set()

    rel_file = '/home/fernando/MatchZoo/data/robust04/cv_splits/test.5.txt'
    rel = read_relation(filename=rel_file)
    #rel.extend(read_relation(filename='/home/fernando/MatchZoo/data/robust04/relation_train.txt'))
    #rel.extend(read_relation(filename='/home/fernando/MatchZoo/data/robust04/relation_valid.txt'))
    print('Instance size: %s' % (len(rel)), end='\n')
    word_dict, _ = read_word_dict(
        "/home/fernando/MatchZoo/data/robust04/word_dict_new_n_stem_filtered_rob04_embed.txt"
    )

    for label, d1, d2 in rel:
Example #4
def perfume_search(command):
    query = QueryParser(Version.LUCENE_CURRENT, "name",
                        analyzer).parse(command)
    return query
Example #5
    print("Loading Lucene Index ...")
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # fixed typo: java.awt, not java.aws
    analyzer = StandardAnalyzer()
    searchDir = NIOFSDirectory(Paths.get(args.index_path))
    searcher = IndexSearcher(DirectoryReader.open(searchDir))

    # try tuning the hyperparameters of bm25
    for k1 in [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]:
        for b in [0.5, 0.6, 0.7, 0.8, 0.9]:

            print(f"Grid search.... k1: {k1}; b: {b}")

            searcher.setSimilarity(BM25Similarity(k1, b))

            parser = QueryParser('Context', analyzer)

            retrieved = []
            print("Searching ...")
            for q in tqdm(questions):
                query = parser.parse(QueryParser.escape(q))
                # print(q, "|", QueryParser.escape(q), "|", query)
                # import pdb; pdb.set_trace()
                scoreDocs = searcher.search(query, args.topk).scoreDocs
                topkDocs = []
                for hit in scoreDocs:
                    doc = searcher.doc(hit.doc)
                    topkDocs.append({
                        "title": doc.get("Title"),
                        "text": doc.get("Context")
                    })
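                retrieved.append(topkDocs)

            # Hedged completion (the excerpt cuts off here): score this (k1, b)
            # pair by whether a gold answer string occurs in the top-k passages;
            # `answers[i]` pairing with `questions[i]` is an assumption.
            hits_at_k = sum(
                1 for docs, gold in zip(retrieved, answers)
                if any(gold.lower() in (d["text"] or "").lower() for d in docs))
            print(f"k1={k1} b={b}: {hits_at_k}/{len(questions)} answerable at top-{args.topk}")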
Example #6
    def find_documents(self, search_text):
        self.query = QueryParser("contents", self.analyzer).parse(search_text)
        self.hits = self.searcher.search(self.query, 50)

        return self.hits
Example #7
def mid_search(mid):
    # Bug fix: parse the argument, not the undefined global `command`.
    query = QueryParser(Version.LUCENE_CURRENT, "mid",
                        analyzer).parse(mid)
    return query
Example #8
def run(searcher_good, searcher_bad, analyzer):
    while True:
        # Assumption: `command` was read interactively in the elided original;
        # without this line the loop would reference an undefined name.
        command = raw_input("Query (empty to quit): ")
        if command == '':
            return
        command_dict = parseCommand(command)
        total_num = 20

        # These SortField choices pick the result order: price (low to high),
        # popularity (total comment count), positive-review rate, or overall
        # score.
        #s=SortField("price",SortField.Type.FLOAT,False)
        #s=SortField("total_comment",SortField.Type.FLOAT,True)
        s = SortField("good_rate", SortField.Type.FLOAT, True)
        #s=SortField("socre",SortField.Type.FLOAT,True)
        so = Sort(s)

        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        # These two (commented) lines would restrict the price range:
        #q=NumericRangeQuery.newFloatRange("price",100.0,200.0,True,True)
        #querys.add(q,BooleanClause.Occur.MUST)

        scoreDocs_good = searcher_good.search(querys, total_num, so).scoreDocs
        total = len(scoreDocs_good)
        flag = True
        if len(scoreDocs_good) < total_num:
            scoreDocs_bad = searcher_bad.search(querys, total_num,
                                                so).scoreDocs
            total = total + len(scoreDocs_bad)
            flag = False
        if total > total_num:
            total = total_num
        print "%s total matching documents." % total

        #"url"是网址,“img_url”是图片网址,“brand”是品牌
        for scoreDoc_good in scoreDocs_good:
            doc = searcher_good.doc(scoreDoc_good.doc)
            ##            explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'title:', doc.get('title')
            print 'total_comment', doc.get("total_comment")
            print 'price', doc.get("price")
            print 'socre', doc.get("socre")
            print 'brand', doc.get("brand")
            print 'good_rate', doc.get("good_rate")
            print
        if not flag:
            t = 0
            for scoreDoc_bad in scoreDocs_bad:
                t = t + 1
                doc = searcher_bad.doc(scoreDoc_bad.doc)
                ##                explanation = searcher.explain(query, scoreDoc.doc)
                print "------------------------"
                print 'title:', doc.get('title')
                print 'total_comment', doc.get("total_comment")
                print 'price', doc.get("price")
                print 'socre', doc.get("socre")  # NB: the index field name is misspelled 'socre' elsewhere
                print 'brand', doc.get("brand")
                print 'good_rate', doc.get("good_rate")
                print
                if t > total_num - 1 - len(scoreDocs_good):
                    break
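
# parseCommand is used but never shown in these examples; a hedged sketch of
# the dict shape its callers assume ('field:value' tokens become per-field
# query strings; the variant in Example #24 also returns a price range):
def parseCommand(command):
    fields = {}
    for tok in command.split():
        k, sep, v = tok.partition(':')
        if sep:
            fields[k] = (fields.get(k, '') + ' ' + v).strip()
        else:
            fields['contents'] = (fields.get('contents', '') + ' ' + tok).strip()
    return fields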
Example #9
def runstext(command, cpage, meth):

    global vm_env, searcher, analyzer
    text = []
    print(command)
    if command == '':
        return

    command_dict = parseCommand(command)
    querys = BooleanQuery()

    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 1000).scoreDocs
    maxnum = len(scoreDocs)

    keywords = QueryParser(Version.LUCENE_CURRENT, "contents",
                           analyzer).parse(command_dict['contents'])
    reslist = []
    maxnum = min(maxnum, 100)
    for i, scoreDoc in enumerate(scoreDocs[:maxnum]):
        doc = searcher.doc(scoreDoc.doc)
        date = doc.get("date")
        score = float(scoreDoc.score)
        reslist.append([doc, date, score])

    style = highlight.SimpleHTMLFormatter("<b><font color=\'red\'>",
                                          "</font></b>")
    high_seg = highlight.Highlighter(style, highlight.QueryScorer(keywords))
    high_seg.setTextFragmenter(highlight.SimpleFragmenter(50))

    if meth == "rel":
        reslist = sorted(reslist, key=lambda res: res[2], reverse=True)
    elif meth == "td":
        reslist = sorted(reslist, key=lambda res: res[1], reverse=True)
    elif meth == "tu":
        reslist = sorted(reslist, key=lambda res: res[1], reverse=False)
    print keywords
    start = (cpage - 1) * 10
    end = min(start + 10, maxnum)
    print start, end
    for i in reslist[start:end]:
        doc = i[0]
        score = i[2]
        date = str(getdate(i[1]))
        text_dic = {}
        text_dic['title'] = doc.get("title").replace('-直播吧zhibo8.cc', '').replace(
            '_新浪竞技风暴_新浪网', '')  # remove site-name suffixes; strip() removes characters, not substrings
        text_dic['url'] = doc.get("url")
        tmpcontent = cleantxt(doc.get("contents"))
        keyword = high_seg.getBestFragment(analyzer, "contents", tmpcontent)
        text_dic['keyword'] = keyword
        text_dic['score'] = score
        text_dic['date'] = date
        text.append(text_dic)
    '''for i, scoreDoc in enumerate(scoreDocs):
        text_dic = {}
        doc = searcher.doc(scoreDoc.doc)

        text_dic['title'] = doc.get("title")
        text_dic['url'] = doc.get("url")
        keyword = high_seg.getBestFragment(analyzer, "contents", cleantxt(doc.get('contents')))
        text_dic['keyword'] = keyword
        text.append(text_dic)'''

    return text, maxnum
Example #10
    def ancientSearch(self, field):
        sear = self._search
        fieldOnly = False
        # Field-only search: no keywords given, so match on the fields alone
        if len(self._commandInfo.getWordList()) == 0:
            fieldOnly = True
            bq = BooleanQuery.Builder()
            fields = self._commandInfo.getFields()
            for key in fields:
                queryx = QueryParser(key, KeywordAnalyzer()).parse(fields[key][0])
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()

        elif len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
            bq = BooleanQuery.Builder()
            q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
            bc = BooleanClause(q, BooleanClause.Occur.MUST)
            bq.add(bc)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        elif self._commandInfo.getKey()[0] == '#':
            bq = BooleanQuery.Builder()
            query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
            query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
            bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
            bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
            bq.add(bc1).add(bc2)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        elif self._commandInfo.getKey()[0] in ['$', '+']:
            bq = BooleanQuery.Builder()
            for w in self._commandInfo.getWordList():
                queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        else:
            query = ''
        hits = sear.search(query, 9999)
        for hit in hits.scoreDocs:
            doc = sear.doc(hit.doc)
            res = doc.get(field)
            id = doc.get('id')
            detail = get_detail(doc)
            zhujie = detail['zhujie']
            if detail['detail'] and 'detail' in detail['detail'].keys():
                detail['detail'] = detail['detail']['detail']
            detail.pop('zhujie')
            detail.pop('text')
            detail.pop('type')
            detail = json.dumps(detail)
            if fieldOnly:
                if not doc.get("text").strip():
                    continue
                if id.count(".") == 2:
                    self._doc[id] = doc.get("text")
                    self._resultSentencesList.append((id, doc.get("text")))
                elif id.count(".") == 1:
                    searcher = self._search
                    query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1')
                    hits = searcher.search(query, 1)

                    for hit in hits.scoreDocs:
                        doc = searcher.doc(hit.doc)
                        res = doc.get("text")
                        if res:
                            self._doc[id+".1"] = doc.get('text')
                            self._resultSentencesList.append((id + ".1", doc.get('text')))
                else:
                    searcher = self._search
                    query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1.1')
                    hits = searcher.search(query, 1)
                    for hit in hits.scoreDocs:
                        doc = searcher.doc(hit.doc)
                        res = doc.get("text")
                        if not doc.get("text").strip():
                            continue
                        if res:
                            self._doc[id+".1.1"] = doc.get('text')
                            self._resultSentencesList.append((id + ".1.1", doc.get('text')))
            elif doc_hit(res, self._commandInfo):
                if key_filter(self._commandInfo, res):
                    if 'section' in self._commandInfo.getFields().keys():
                        if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['section'], 0):
                            continue
                    if 'document' in self._commandInfo.getFields().keys():
                        if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['document'], 1):
                            continue
                    self._doc[id] = res
                    self._resultSentencesList.append((id, res, detail, zhujie))
        return self
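
    # The keyword branches above repeat the same per-field clause loop; a
    # hedged refactor sketch (the helper name is an assumption, not in the
    # original):
    def _addFieldClauses(self, bq):
        fields = self._commandInfo.getFields()
        for name in fields:
            if name in ('section', 'document'):
                continue  # structural filters are applied after the search
            q = QueryParser(name, KeywordAnalyzer()).parse(
                make_ancient_parser(fields[name]))
            bq.add(BooleanClause(q, BooleanClause.Occur.MUST))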
Example #11
def parseQuery(myQuery):
    parser = QueryParser("", StopAnalyzer())
    parsedQuery = parser.parse(myQuery)
    myQueryTerms = parsedQuery.toString().split(" ")
    return myQueryTerms
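
# Hedged usage note: with StopAnalyzer the returned terms come back lowercased
# and stopword-free, e.g. parseQuery("The Quick Fox") -> ['quick', 'fox'];
# splitting the toString() output on spaces assumes no phrase or fielded
# sub-queries.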
Example #12
from org.apache.lucene.search import PhraseQuery
from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause
from org.apache.lucene.search import TermQuery
from org.apache.lucene.index import Term

from rlqa.retriever.lucene_analyzer import MySimpleAnalyzer as MySimpleAnalyzerPython

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# analyzer = MySimpleAnalyzer()
analyzer = MySimpleAnalyzerPython()
shingle_analyzer = ShingleAnalyzerWrapper(analyzer)
sentence = "Stearn received many honours for his work."
print(sentence)

query = QueryParser("text", analyzer).parse(QueryParser.escape(sentence))
print(query)


def parse_query(analyzer, query):
    # Bug fix: tokenize the `query` argument, not the global `sentence`.
    ts = analyzer.tokenStream("dummy", StringReader(query))
    termAtt = ts.getAttribute(CharTermAttribute.class_)
    ts.reset()
    tokens = []
    while ts.incrementToken():
        tokens.append(termAtt.toString())
    ts.end()
    ts.close()

    booleanQuery = BooleanQuery.Builder()
    for token in tokens:
        # The excerpt is truncated here; a plausible completion with the
        # imported Term class (hedged):
        booleanQuery.add(TermQuery(Term("text", token)),
                         BooleanClause.Occur.SHOULD)
    return booleanQuery.build()
Example #13
def __init__(self, index_dir):
    self.index_dir = index_dir
    self.indexDir = SimpleFSDirectory(File(self.index_dir).toPath())
    self.q_parser = QueryParser("", WhitespaceAnalyzer())
    self.commit_max = 500000
    self.__get_writer_searcher()
Example #14
    def button_search_clicked(self):
        t1 = time.time()
        title_substring = self.ui.lineEdit_title.text()
        mode = self.ui.comboBox.currentText()
        # The mode strings are Russian UI labels: 'Полное совпадение' = exact
        # match, 'Частичное совпадение' = partial match, 'Частичное совпадение
        # по словам' = partial match by words; '+ Год' adds a year filter.

        # SQLite/PostgreSQL
        if not self.ui.radioButton_Lucene.isChecked():
            if mode == 'Полное совпадение':
                params = {'title': title_substring}
                query_string = """select * from movies where name ilike %(title)s"""

            if mode == 'Частичное совпадение':
                params = {'title': title_substring}
                query_string = """select * from movies where name ilike '%%'||%(title)s||'%%'"""

            if mode == 'Частичное совпадение по словам':
                params = {
                    'title' + str(i): v
                    for i, v in enumerate(title_substring.split())
                }
                query_string = """select * from movies where """ + ' or '.join(
                    [
                        """name ilike '%%'||%({})s||'%%'""".format(t)
                        for t in params.keys()
                    ])

            if mode == 'Полное совпадение + Год':
                year_substring = self.ui.lineEdit_year.text()
                year_substring = year_substring if year_substring != '' else None
                params = {'title': title_substring, 'year': year_substring}
                query_string = """select * from movies where name ilike %(title)s and year = %(year)s"""

            if mode == 'Частичное совпадение + Год':
                year_substring = self.ui.lineEdit_year.text()
                year_substring = year_substring if year_substring != '' else None
                params = {'title': title_substring, 'year': year_substring}
                query_string = """select * from movies where name ilike '%%'||%(title)s||'%%' and year = %(year)s"""

            if mode == 'Частичное совпадение по словам + Год':
                year_substring = self.ui.lineEdit_year.text()
                year_substring = year_substring if year_substring != '' else None
                params = {
                    'title' + str(i): v
                    for i, v in enumerate(title_substring.split())
                }
                query_string = ' or '.join([
                    """name ilike '%%'||%({})s||'%%'""".format(t)
                    for t in params.keys()
                ])
                params.update({'year': year_substring})
                query_string = 'select * from movies where ({}) and year = %(year)s'.format(
                    query_string, year_substring)

            # PostgreSQL connection
            if self.ui.radioButton_PostgreSQL.isChecked():
                con = psycopg2.connect(user='******',
                                       password=self.db_password,
                                       host='db.mirvoda.com',
                                       port='5454',
                                       dbname='information_retrieval')
                se = 'PostgreSQL'

            # SQLite connection
            # NB: the ILIKE and %(name)s placeholders above are
            # psycopg2/PostgreSQL syntax; sqlite3 would need LIKE and
            # :name-style parameters.
            else:
                con = sqlite3.connect(PATH + '/imdb.db')
                se = 'SQLite'

            df = pd.read_sql(query_string, con, params=params).head(LIMIT)
            con.close()
            df = df.fillna('').astype(str)
            df['year'] = df['year'].apply(lambda x: x.replace('.0', ''))

        # Lucene
        else:
            # NB: initVM may only run once per process; repeated clicks need a
            # guard like the lucene_vm_init flag in Example #15.
            lucene.initVM()
            index_dir = SimpleFSDirectory(Paths.get('index'))
            reader = DirectoryReader.open(index_dir)
            searcher = IndexSearcher(reader)
            query_string = ''
            se = 'Lucene'
            if mode == 'Полное совпадение':
                query_string = 'name:"{}"'.format(title_substring)

            if mode == 'Частичное совпадение':
                query_string = 'name:{}'.format(title_substring)

            if mode == 'Частичное совпадение по словам':
                query_string = ' or '.join([
                    """name:{}""".format(ss) for ss in title_substring.split()
                ])

            if mode == 'Полное совпадение + Год':
                year_substring = self.ui.lineEdit_year.text()
                query_string = 'name:"{}" AND year:"{}"'.format(
                    title_substring, year_substring)

            if mode == 'Частичное совпадение + Год':
                year_substring = self.ui.lineEdit_year.text()
                query_string = 'name:{} AND year:"{}"'.format(
                    title_substring, year_substring)

            if mode == 'Частичное совпадение по словам + Год':
                year_substring = self.ui.lineEdit_year.text()
                query_string = ' or '.join([
                    """name:{}""".format(ss) for ss in title_substring.split()
                ])
                query_string = '({}) and year:"{}"'.format(
                    query_string, year_substring)

            query = QueryParser("defaultField",
                                StandardAnalyzer()).parse(query_string)
            hits = searcher.search(query, LIMIT)

            df = pd.DataFrame()
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                df = df.append(
                    [[doc.get('id'),
                      doc.get('name'),
                      doc.get('year')]],
                    ignore_index=True)
            if not df.empty:
                df.columns = ['id', 'name', 'year']

        pandas_model = PandasModel(df)
        self.tableView.setModel(pandas_model)
        self.tableView.horizontalHeader().setSectionResizeMode(1)

        t2 = time.time()
        self.statusBar().showMessage('Searched [{}] with {} for {} s'.format(
            query_string, se, str(t2 - t1)))

        logging.info('Searched [{}] with {} for {} s'.format(
            query_string, se, str(t2 - t1)))
        logging.info(df)
        logging.info(
            '---------------------------------------------------------')
Example #15
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher = IndexSearcher(reader)

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # initialize word2vec
    print 'load word2vec model'
    w2vmodel = gensim.models.Word2Vec.load_word2vec_format(
        "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary",
        binary=True)
    print 'finish loading word2vec model'

    # search
    global hitsPerPage
    fields = ['name', 'value']
    #parser=MultiFieldQueryParser(fields,analyzer)
    #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    rec_result = open('pylucene.runs', 'w')

    for i in range(len(queries)):
        query = queries[i]
        print 'processing query ' + str(i) + ':' + query[0]
        querystr = remove_duplicate(stemSentence(query[1]))
        #q_lucene=MultiFieldQueryParser.parse(parser,querystr)
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        print "q_lucene: " + q_lucene.toString()
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # build query object for computeScore
        queryObj = Query_Object(query, mongoObj, w2vmodel)

        # initialize duplicate remover
        docDup = set()

        # find candidate results after 1st round filter
        candidates = PriorityQueue()
        for j in range(len(hits)):
            docID = hits[j].doc
            d = searcher.doc(docID)
            name = cleanSentence(d['title'].strip())
            if name in docDup:
                continue
            docDup.add(name)
            # build entity object
            entityObj = Entity_Object(d, mongoObj, w2vmodel)
            score = computeScore(queryObj, entityObj, mongoObj, w2vmodel)
            #score=hits[j].score
            candidates.put((-score, j))

        # output results from priority queue larger score first
        rank = 0
        while candidates.empty() == False and rank < 100:
            rank = rank + 1
            item = candidates.get()
            score = -item[0]
            j = item[1]  # index of hits[]
            docID = hits[j].doc
            d = searcher.doc(docID)
            title = '<dbpedia:' + d.get('title') + '>'
            res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str(
                rank) + '\t' + str(score) + '\t' + 'pylucene_multifield'
            rec_result.writelines(res_line + '\n')
    rec_result.close()
Example #16
def superSearch(command, command_dict, urlclick):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File('index2.3'))
    print "run super search..."
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    command = ' '.join(jieba.cut_for_search(command))
    querys = BooleanQuery()
    if command:
        query = QueryParser(Version.LUCENE_CURRENT, 'nameforsearch',
                            analyzer).parse(command)
        querys.add(query, BooleanClause.Occur.SHOULD)
    for k, v in (command_dict[0]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        query.setBoost(0.1)
        querys.add(query, BooleanClause.Occur.MUST)
    for k, v in (command_dict[1]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST_NOT)
    scoreDocs = searcher.search(querys, 10000).scoreDocs
    swxc_res = findres(command + ' ' + command_dict[0].get("ingredient", ''),
                       scoreDocs, searcher)
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # The comma (not a space) inside this tag is deliberate: the code below
    # strips spaces from name fragments and later turns commas back into spaces.
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")
    if command:
        scorer = QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'name',
                        analyzer).parse(command))
        highlighters = [Highlighter(formatter_name, scorer)]
    else:
        highlighters = ['']
    if command_dict[0].get('ingredient'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                                analyzer).parse(
                                    command_dict[0]['ingredient']))))
    else:
        highlighters.append('')
    if command_dict[0].get('taste'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'taste',
                                analyzer).parse(command_dict[0]['taste']))))
    else:
        highlighters.append('')
    if command_dict[0].get('tech'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'tech',
                                analyzer).parse(command_dict[0]['tech']))))
    else:
        highlighters.append('')
    fragmenter = SimpleFragmenter(1000)
    for h in highlighters:
        if h:
            h.setTextFragmenter(fragmenter)

    results = []
    for scoreDoc in scoreDocs:
        if (scoreDoc.score * len(scoreDocs) < 200
                and len(scoreDocs) > 200) or scoreDoc.score < 0.1:
            continue
        doc = searcher.doc(scoreDoc.doc)
        if command:
            highlighterContent = highlighters[0].getBestFragment(
                analyzer, 'name', doc.get('name'))
        else:
            highlighterContent = ''
        if highlighters[1]:
            highlighterContent2 = highlighters[1].getBestFragment(
                analyzer, 'ingredient', doc.get('ingredient'))
        else:
            highlighterContent2 = ''
        if highlighters[2]:
            highlighterContent3 = highlighters[2].getBestFragment(
                analyzer, 'taste', doc.get('taste'))
        else:
            highlighterContent3 = ''
        if highlighters[3]:
            highlighterContent4 = highlighters[3].getBestFragment(
                analyzer, 'tech', doc.get('tech'))
        else:
            highlighterContent4 = ''

        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')
        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(',', '')
        else:
            highlighterContent2 = (doc.get('ingredient')).replace(',', '')
        if highlighterContent3:
            pass
        else:
            highlighterContent3 = doc.get('taste')
        if highlighterContent4:
            pass
        else:
            highlighterContent4 = doc.get('tech')
        results.append(
            (highlighterContent, doc.get('img'),
             doc.get('content').replace(' ', ''),
             highlighterContent2, highlighterContent3, highlighterContent4,
             doc.get('others').replace(',',
                                       ''), doc.get('url'), scoreDoc.score))

        # Bubble pass over the top 20: when two scores are within 0.1, let the
        # more-clicked URL rank first.
        for i in range(0, min(20, len(results)) - 1):
            flag = True
            for j in range(0, min(20, len(results)) - i - 1):
                if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[
                        results[j][7]] < urlclick[results[j + 1][7]]:
                    flag = False
                    results[j], results[j + 1] = results[j + 1], results[j]
            if flag:
                break

    return results, swxc_res
Example #17
	new_query = ''.join(new_qt)

	number_of_relevants = 0
	for k in relevance[nq].keys():
		#print relevance[nq][k]
		number_of_relevants += relevance[nq][k]

#	new_query = ''
#	for i in xrange(len(important_words)):
#		new_query += important_words[i] + ' '
		
	print "New query: ",new_query
	query = QueryParser("contents",analyzer).parse(new_query)
	scoreDocs = searcher.search(query,10).scoreDocs
	total_rel = 0
	ap = 0.
	nn = 0
	for d in scoreDocs:
		nn += 1
		doc = searcher.doc(d.doc)
		docname = doc.get("name")
		rel = 0
		if docname.strip() in relevance[nq]:
			rel = relevance[nq][docname.strip()]
		total_rel += rel
		if (rel == 1):
			ap += float(total_rel)/float(nn)
		print docname + " " + str(rel)
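	# Hedged completion (the excerpt cuts off here): normalize the summed
	# precision by the number of relevant documents to get average precision.
	if number_of_relevants > 0:
		ap /= float(number_of_relevants)
	print "AP:", ap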
Example #18
def run(searcher, analyzer, command, urlclick):

    if command == '':
        return []
    res = firstsearch(searcher, analyzer, command)
    command = ''.join(my_jieba.cut(command))
    command = " ".join(jieba.cut(command, cut_all=True))
    if len(res) > 0:
        scoreDocs = res
    else:
        querys = BooleanQuery()
        for k in tag:
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(command)
            if k == 'taste' or k == 'tech':
                query.setBoost(0.5)
            querys.add(query, BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 10000).scoreDocs

    swxc_res = findres(command, scoreDocs, searcher)
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # As in Example #16, the comma inside the tag survives the space-stripping
    # applied to the fragments below.
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")
    scorer = QueryScorer(
        QueryParser(Version.LUCENE_CURRENT, 'name', analyzer).parse(command))
    highlighter1 = Highlighter(formatter_name, scorer)
    highlighter2 = Highlighter(
        formatter_name,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'content',
                        analyzer).parse(command)))
    highlighter3 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                        analyzer).parse(command)))
    highlighter4 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'taste',
                        analyzer).parse(command)))
    highlighter5 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'tech',
                        analyzer).parse(command)))
    highlighter6 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'others',
                        analyzer).parse(command)))

    fragmenter = SimpleFragmenter(1000)
    highlighter1.setTextFragmenter(fragmenter)
    highlighter2.setTextFragmenter(fragmenter)
    highlighter3.setTextFragmenter(fragmenter)
    highlighter4.setTextFragmenter(fragmenter)
    highlighter5.setTextFragmenter(fragmenter)
    highlighter6.setTextFragmenter(fragmenter)

    results = []
    for scoreDoc in scoreDocs:
        if (len(scoreDocs) > 200 and
                len(scoreDocs) * scoreDoc.score < 2) or scoreDoc.score < 0.002:
            continue
        doc = searcher.doc(scoreDoc.doc)

        highlighterContent = highlighter1.getBestFragment(
            analyzer, 'name', doc.get('name'))
        highlighterContent2 = highlighter2.getBestFragment(
            analyzer, 'content', doc.get('content'))
        highlighterContent3 = highlighter3.getBestFragment(
            analyzer, 'ingredient', doc.get('ingredient'))
        highlighterContent4 = highlighter4.getBestFragment(
            analyzer, 'taste', doc.get('taste'))
        highlighterContent5 = highlighter5.getBestFragment(
            analyzer, 'tech', doc.get('tech'))
        highlighterContent6 = highlighter6.getBestFragment(
            analyzer, 'others', doc.get('others'))

        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')

        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(' ', '')
            highlighterContent2 = highlighterContent2.replace(',', ' ')
        else:
            highlighterContent2 = doc.get('content').replace(' ', '')
        if highlighterContent3:
            highlighterContent3 = highlighterContent3.replace(',', '')
        else:
            highlighterContent3 = (doc.get('ingredient')).replace(',', '')
        if highlighterContent4:
            pass
        else:
            highlighterContent4 = doc.get('taste')
        if highlighterContent5:
            pass
        else:
            highlighterContent5 = doc.get('tech')
        if highlighterContent6:
            highlighterContent6 = highlighterContent6.replace(',', '')
        else:
            highlighterContent6 = (doc.get('others')).replace(',', '')

        results.append(
            (highlighterContent, doc.get('img'), highlighterContent2,
             highlighterContent3, highlighterContent4, highlighterContent5,
             highlighterContent6, doc.get('url'), scoreDoc.score))

        # Bubble pass over the top 20: when two scores are within 0.1, let the
        # more-clicked URL rank first.
        for i in range(0, min(20, len(results)) - 1):
            flag = True
            for j in range(0, min(20, len(results)) - i - 1):
                if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[
                        results[j][7]] < urlclick[results[j + 1][7]]:
                    flag = False
                    results[j], results[j + 1] = results[j + 1], results[j]
            if flag:
                break

    return results, swxc_res
Example #19
def former_search(former):
    # Bug fix: parse the argument, not the undefined global `command`.
    query = QueryParser(Version.LUCENE_CURRENT, "former",
                        analyzer).parse(former)
    return query
Example #20
def sentenceCountForQuery(self, query, field='text'):
    qp = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(query)
    collector = TotalHitCountCollector()
    self.searcher.search(qp, collector)
    return collector.getTotalHits()
Example #21
def last_search(last):
    # Bug fix: parse the argument, not the undefined global `command`.
    query = QueryParser(Version.LUCENE_CURRENT, "last",
                        analyzer).parse(last)
    return query
Example #22
def search(self, qstring):
    query = QueryParser("web", self.analyzer).parse(qstring)
    scoreDocs = self.searcher.search(query, 50).scoreDocs
    return [self.searcher.doc(score_doc.doc) for score_doc in scoreDocs]
Example #23
def parse_query(query, fieldname):
    query_parser_obj = QueryParser(fieldname, StandardAnalyzer())
    query_parser = query_parser_obj.parse(query)
    return query_parser
Example #24
def Run_Price(searcher_good, searcher_bad, analyzer, command, brand):
    while True:
        command_dict, low, high = parseCommand(command, brand)
        total_num = 20

        s = SortField("price", SortField.Type.FLOAT, False)
        #s=SortField("total_comment",SortField.Type.FLOAT,True)
        #s=SortField("good_rate",SortField.Type.FLOAT,True)
        #s=SortField("socre",SortField.Type.FLOAT,True)
        so = Sort(s)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        # Restrict results to the requested price range
        q = NumericRangeQuery.newFloatRange("price", low, high, True, True)
        querys.add(q, BooleanClause.Occur.MUST)

        scoreDocs_good = searcher_good.search(querys, total_num, so).scoreDocs
        total = len(scoreDocs_good)
        flag = True
        if len(scoreDocs_good) < total_num:
            scoreDocs_bad = searcher_bad.search(querys, total_num,
                                                so).scoreDocs
            total = total + len(scoreDocs_bad)
            flag = False
        if total > total_num:
            total = total_num
        #Total is the number of matched websites
        res = []
        for scoreDoc_good in scoreDocs_good:
            unit = []
            doc = searcher_good.doc(scoreDoc_good.doc)
            title = doc.get('title')
            title = title.replace(' ', '')  # bug fix: replace() returns a new string
            title = title[:18]
            total_comment = doc.get("total_comment")
            price = doc.get("price")
            socre = doc.get("socre")
            brand = doc.get("brand")
            good_rate = doc.get("good_rate")
            url = doc.get("url")
            img_url = doc.get("img_url")
            comment = doc.get("comment").split()
            unit.append(title)  #0
            unit.append(total_comment)  #1
            unit.append(price)  #2
            unit.append(socre)  #3
            unit.append(brand)  #4
            unit.append(good_rate)  #5
            unit.append(url)  #6
            unit.append(img_url)  #7
            unit.append(comment)  #8
            res.append(unit)
        if not flag:
            t = 0
            for scoreDoc_bad in scoreDocs_bad:
                t = t + 1
                unit = []  # bug fix: start a fresh record for each document
                doc = searcher_bad.doc(scoreDoc_bad.doc)
                ##                explanation = searcher.explain(query, scoreDoc.doc)
                title = doc.get('title')
                title = title.replace(' ', '')  # bug fix: replace() returns a new string
                title = title[:18]
                total_comment = doc.get("total_comment")
                price = doc.get("price")
                socre = doc.get("socre")
                brand = doc.get("brand")
                good_rate = doc.get("good_rate")
                url = doc.get("url")
                img_url = doc.get("img_url")
                comment = doc.get("comment").split()
                unit.append(title)
                unit.append(total_comment)
                unit.append(price)
                unit.append(socre)
                unit.append(brand)
                unit.append(good_rate)
                unit.append(url)
                unit.append(img_url)
                unit.append(comment)
                res.append(unit)
                if t > total_num - 1 - len(scoreDocs_good):
                    break
        res.append(brand)
        return res
Example #25
    def __init__(self, *args):
        super(HighlighterTestCase, self).__init__(*args)

        self.parser = QueryParser(self.FIELD_NAME, StandardAnalyzer())
Example #26
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(File(indexDir))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 51).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    print template.substitute(table)

Example #27
    def more_like_this2(self, limit, item_doc, user_query):
        github_result = []
        if not item_doc:
            item_doc.append(ResultItem(None, 1.0, "No Title", 0, 0))

        query = ""
        if item_doc.doc:
            query += self.document_to_query(item_doc.doc)
        query += user_query
        query = remove_unified_stop_lists(query)
        print '................................................................................................'
        print "Project Searcher Unified Query :", query
        print '................................................................................................'
        write_search_log(
            "................................................................................................\n"
            + "Project Searcher Unified Query : " +
            str(query.encode('utf-8')) + "\n" +
            "................................................................................................\n"
        )
        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                like_query = queryparser.parse(query)
                hits = self.searcher.search(like_query,
                                            limit).scoreDocs  # 10 hits per answer
                temp = 1
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        like_query, hit.doc)
                    #print "Matched Terms : ", matched_terms

                    print("File %s" % temp, doc.get("file"), "//",
                          doc.get("file_content")
                          )  #, "line_numbers", doc.get("line_numbers"))
                    write_search_log("File " + str(temp) +
                                     str(doc.get("file")) + "//" +
                                     str(doc.get("file_content")) + "\n")
                    temp += 1

                    file_path = doc.get("file")
                    print 'file_path = ', file_path
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        print "CAN'T OPEN THE FILE"
                        pass

                    if content:
                        item = GithubResultItem(doc.get("file"), content,
                                                matched_terms, hit.score,
                                                item_doc,
                                                doc.get("line_numbers"),
                                                hit.doc)
                        # print item.score
                        github_result.append(item)

            except Exception as e:
                print "GitSearcher Error: %s" % e
                print(traceback.format_exc())

        #sorted(github_result, key=attrgetter())

            print 'github_result : ', github_result
        return github_result
Example #28
lucene.initVM(maxheap='8192m')

q = sys.argv[1]
q = '\"adolf hitler\" \"national museum\"'  # hardcoded test query; overrides the argv value above

field = 'content'
index_dir = os.path.join(os.path.expanduser('~'),
                         'github/entityqa/data/index_180629')
print(index_dir)
hitsPerPage = int(sys.argv[2])

reader = DirectoryReader.open(FSDirectory.open(Paths.get(index_dir)))
searcher = IndexSearcherE(reader)

analyzer = StandardAnalyzer()
qparser = QueryParser(field, analyzer)
query = qparser.parse(q)

print("Searching for:", query.toString(field))

topdocs = searcher.searchE(query, 5 * hitsPerPage, 'ent')
topdocs = TopDocsE.cast_(topdocs)
hitEntities = topdocs.scoreDocs
hitDocs = topdocs.entityWeightedDocs
numTotalHits = topdocs.totalHits
numTotalDocs = topdocs.totalDocs

print("{} total matching entities ({} docs)".format(numTotalHits, numTotalDocs))

# retriever.searcher.doc(hitDocs[0])