Example #1
    def runSearch(self, runCount, mainThread=False):
        """ search for runCount number of times """

        # problem: if there are any assertion errors in the child
        #   thread, the calling thread is not notified and may still
        #   consider the test case passed. We are using self.totalQueries
        #   to double check that work has actually been done.

        if not mainThread:
            getVMEnv().attachCurrentThread()
        time.sleep(0.5)

        searcher = self.getSearcher()
        try:
            self.query = PhraseQuery()
            for word, count in self.testData[0:runCount]:
                query = TermQuery(Term("field", word))
                topDocs = searcher.search(query, 50)
                self.assertEqual(topDocs.totalHits, count)

                self.lock.acquire()
                self.totalQueries += 1
                self.lock.release()
        finally:
            del searcher
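
The comment above describes a real PyLucene pitfall: an AssertionError raised inside a worker thread dies with that thread. A minimal sketch of one way to hand such failures back to the caller (illustrative names, Python 2 spelling to match these examples):

import threading
from lucene import getVMEnv

def run_and_check(run_search):
    failures = []                             # shared list; list.append is atomic
    def worker():
        getVMEnv().attachCurrentThread()      # attach before any Lucene call
        try:
            run_search()
        except Exception, e:                  # AssertionError included
            failures.append(e)
    t = threading.Thread(target=worker)
    t.start()
    t.join()
    if failures:
        raise failures[0]                     # re-raise in the calling thread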
Example #3
    def run(self):
        owf = "%sresult%s.csv" % (WRITE_DIR, self.i)
        print owf

        t = open(owf, "w")

        getVMEnv().attachCurrentThread()
        searcher = lucene.IndexSearcher(directory,True)

        a = 0

        for line in self.content:
            query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT,
                                       'content',analyzer).parse(line)
            results = searcher.search(query,None,1)
    
            score_docs = results.scoreDocs

            b = 0
            result = ''                      # default when the query returns no hits
            for score_doc in score_docs:
                doc = searcher.doc(score_doc.doc)
                result = doc.get('tag')      # Document exposes get(), not item access
                b += 1
        
            t.write("%s,\"%s\"\n"%(self.label[a],result.strip()))
            a += 1
            if a % 10 == 0:
                print "线程%s 完成%s,百分之%s已经完成"%(self.i,a,1.0*a/len(self.content))
Example #4
def matchE(request):
    lucene.getVMEnv().attachCurrentThread()
    try:
        student = {}
        student['name'] = request.POST['student_name']
        student['interest'] = \
            request.POST['student_interest']
        student['affiliation'] = \
            request.POST['student_affiliation']
    except KeyError:
        return render_to_response('index.html',
                {'error_msg':'missing field'},
                context_instance=RequestContext(request))
    else:
        prof_matcher = matcher()
        prof_list = prof_matcher.getProfMatch(student)
        request.session['prof_list'] = prof_list
        request.session['student'] = student
        info_list = []
        for i, prof in enumerate(prof_list):
            score, explainList = prof_matcher.explainPos(i + 1)
            info_list.append((prof, score, explainList))
        for prof in prof_list:
            print prof['name']
            aff_count = prof['affiliation'].count(student['affiliation'])
            prof['co_count'] = aff_count
        student = request.session.get('student')
        print 'in match', student, prof_list[0].get('name')
        return render_to_response('explain.html', {'info_list': info_list, 'student': student})
Example #5
    def __init__(self, indexDir, doClear=True, computeLengthNorm=False):
        #         if not jpype.isJVMStarted():
        #         lucene.initVM()
        lucene.getVMEnv().attachCurrentThread()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        # self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 100678)#is here?
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setRAMBufferSizeMB(256.0)  # cap the RAM buffer before auto-flush at 256 MB
        self.config.setMaxBufferedDocs(10000)  # cap buffered docs before auto-flush at 10000
        if not computeLengthNorm:
            sim = CustomSimilarity()
            self.config.setSimilarity(sim)
        self.path = os.path.join(INDEX_PATH, indexDir)
        # print self.path
        # path.mkdir(self.path)
        #         if doClear:
        #             self.clearExistingIndex()
        self.store = SimpleFSDirectory(File(self.path))
        self.writer = IndexWriter(self.store, self.config)

        self.t1 = FieldType()  # field type t1
        self.t1.setIndexed(True)
        self.t1.setStored(True)
        self.t1.setTokenized(False)
        self.t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.t2 = FieldType()  # field type t2
        self.t2.setIndexed(True)
        self.t2.setStored(False)
        self.t2.setTokenized(True)
        self.t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
Example #6
def matchE(request):
    lucene.getVMEnv().attachCurrentThread()
    try:
        student = {}
        student['name'] = request.POST['student_name']
        student['interest'] = \
            request.POST['student_interest']
        student['affiliation'] = \
            request.POST['student_affiliation']
    except KeyError:
        return render_to_response('index.html', {'error_msg': 'missing field'},
                                  context_instance=RequestContext(request))
    else:
        prof_matcher = matcher()
        prof_list = prof_matcher.getProfMatch(student)
        request.session['prof_list'] = prof_list
        request.session['student'] = student
        info_list = []
        for i, prof in enumerate(prof_list):
            score, explainList = prof_matcher.explainPos(i + 1)
            info_list.append((prof, score, explainList))
        for prof in prof_list:
            aff_count = prof['affiliation'].count(student['affiliation'])
            prof['co_count'] = aff_count
        student = request.session.get('student')
        return render_to_response('explain.html', {
            'info_list': info_list,
            'student': student
        })
Example #7
def initIndex(tbl):
    lucene.getVMEnv().attachCurrentThread()
    writer = getWriter(getStore(), getAnalyzer(), True)
    STORE = lucene.Field.Store.YES
    COMPRESS = lucene.Field.Store.COMPRESS
    TOKENIZED = lucene.Field.Index.TOKENIZED
    UN_TOKENIZED = lucene.Field.Index.UN_TOKENIZED
    rowset = tbl.select()
    acc = 0
    for row in rowset:
        acc += 1
        if acc == 100:
            acc = 0
            sys.stdout.write(".")
            sys.stdout.flush()
        #Begin Lucene copy section
        doc = lucene.Document()
        doc.add(lucene.Field("id", unicode(row.id), STORE, UN_TOKENIZED))
        doc.add(lucene.Field('data', unicode(row.data), COMPRESS, TOKENIZED))
        doc.add(lucene.Field('source', unicode(row.source), COMPRESS, TOKENIZED))
        #End Lucene copy section
        writer.addDocument(doc)
    print "|"
    writer.optimize(True)
    writer.close()
Example #8
def process_q_test(q, out_q):
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()

    index = DirectoryReader.open(SimpleFSDirectory(
        Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    preprocessor = Preprocess()

    while not exitFlag:
        qid, query = q.get()
        tname = multiprocessing.current_process().name
        # print(tname, qid, query)
        if query == "DONE":
            break

        try:
            # dids, scores = get_lm_matched_docs(query, searcher, qparser, 2000)
            # if len(dids) >= 10:
            #     out_q.put((qid, dids, scores))
            dids_text = get_lm_doc_snippets(query, searcher, qparser, analyzer,
                                            preprocessor)
            out_q.put((qid, dids_text))
        except:
            print('%s exception %s, %s' % (tname, qid, query))
Example #9
def reindex(row):
    if "id" not in row.__dict__:
        return
    lucene.getVMEnv().attachCurrentThread()
    try:
        writer = getWriter(getStore(), getAnalyzer())
        #print "got Writer"
        STORE = lucene.Field.Store.YES
        COMPRESS = lucene.Field.Store.COMPRESS
        TOKENIZED = lucene.Field.Index.TOKENIZED
        UN_TOKENIZED = lucene.Field.Index.UN_TOKENIZED
        #Begin Lucene copy section
        doc = lucene.Document()
        #print "created new document"
        doc.add(lucene.Field("id", unicode(row.id), STORE, UN_TOKENIZED))
        #print "added id field"
        doc.add(lucene.Field('data', unicode(row.data), COMPRESS, TOKENIZED))
        doc.add(lucene.Field('source', unicode(row.source), COMPRESS, TOKENIZED))
        #End Lucene copy section
        writer.deleteDocuments(lucene.Term("id", unicode(row.id)))
        #print "deleted existing document"
        writer.addDocument(doc)
        #print "added document"
        writer.optimize(True)
        #print "optimized index"
        writer.close()
        #print "closed writer"
    except:
        print "Failed in reindex of " + unicode(row.id) + "!"
        delLock()
Example #10
def query_network():
    """Handle API request '/network'.

    API Request Parameters
    ----------------------
        ids : list of int
        nodes_limit : int
        edges_limit : int
        include_user_mentions : bool

    API Response Keys
    -----------------
        status : string
        num_of_entries : int
        edges : dict
            canonical_url : string
            date_published : string formatted datetime
            domain : string
            from_user_id : string
            from_user_screen_name : string
            id : int
            is_mention : bool
            site_type : {'claim', 'fact_checking'}
            title : string
            to_user_id : string
            to_user_screen_name : string
            tweet_created_at : string formatted datetime
            tweet_id: string
            tweet_type: {'origin', 'retweet', 'quote', 'reply'}
    """
    lucene.getVMEnv().attachCurrentThread()
    q_network_schema = Schema({
        'ids': Use(flask.json.loads),
        Optional('nodes_limit', default=1000): And(Use(int), lambda i: i > 0),
        Optional('edges_limit', default=12500): And(Use(int), lambda i: i > 0),
        Optional('include_user_mentions', default=True): And(
            unicode, Use(lambda s: s.lower()),
            lambda s: s in ('true', 'false'),
            Use(lambda s: True if s == 'true' else False)),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_network_schema.validate(q_kwargs)
        df = db_query_network(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No edge could be built!')
        response = dict(status='OK',
                        num_of_entries=len(df),
                        edges=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
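Example #11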
	def get_instance():
		""" Static access method. """
		if QueryLuceneManager.__instance is None:
			lucene.initVM()
			QueryLuceneManager.__instance = QueryLucene()

		lucene.getVMEnv().attachCurrentThread()

		return QueryLuceneManager.__instance
Example #12
 def __getitem__(self, key):
     try:
         indexer = super(GetIndexers, self).__getitem__(key)
         self[key] = indexer
     except KeyError:
         indexer = db_indexers.get(key, None)
         if not indexer:
             raise KeyError, "Database not found"
         lucene.getVMEnv().attachCurrentThread()
     return indexer
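Example #13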
def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles
Example #14
def query_top_spreaders():
    """Handle API request '/top-user'.

    API Request Parameters
    ----------------------
        upper_day : string formatted datetime
        most_recent : bool

    API Response Keys
    -----------------
        status : string
        num_of_entries : int
        spreaders : dict
            bot_score : float
            number_of_tweets : int
            site_type : {'claim', 'fact_checking'}
            spreading_type : {'active', 'influencial'}
            upper_day : string formatted datetime
            user_id : int
            user_raw_id : string
            user_screen_name : string

    """
    lucene.getVMEnv().attachCurrentThread()
    yesterday = datetime.utcnow().date() - timedelta(days=1)
    yesterday = yesterday.strftime('%Y-%m-%d')

    q_top_spreaders_schema = Schema({
        Optional('upper_day', default=yesterday):
        And(Regex('^\d{4}-\d{2}-\d{2}$'),
            Use(dateutil.parser.parse),
            error='Invalid date, should be yyyy-mm-dd format'),
        Optional('most_recent', default=True):
        And(unicode,
            Use(lambda s: s.lower()), lambda s: s in ('true', 'false'),
            Use(lambda s: True if s == 'true' else False)),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_top_spreaders_schema.validate(q_kwargs)
        df = db_query_top_spreaders(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No top spreader found!')
        response = dict(
            status='OK',
            num_of_entries=len(df),
            spreaders=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
Example #15
def rowDeleted(*args, **kw):
	print "id: " + str(args[0].id) + " is scheduled for termintation"
	lucene.getVMEnv().attachCurrentThread()
	try:
		writer = getWriter(getStore(), getAnalyzer())
		writer.deleteDocuments(lucene.Term("id", unicode(args[0].id)))
		writer.optimize(True)
		writer.close()
	except:
		print "Failed in deletion of " + unicode(args[0].id) + " from lucene"
		delLock()
Example #16
 def run(self):
     print("Starting " + self.name)
     lucene.getVMEnv().attachCurrentThread()
     index = DirectoryReader.open(
         SimpleFSDirectory(Paths.get(robust_index_dir)))
     searcher = IndexSearcher(index)
     searcher.setSimilarity(BM25Similarity())
     analyzer = EnglishAnalyzer()
     qparser = QueryParser("contents", analyzer)
     # process_query(self.name, self.q, self.out_q, searcher, qparser)
     print("Exiting " + self.name)
Example #17
def query_top_articles():
    """Handle API request 'top-articles'

    API Request Parameters
    ----------------------
        upper_day : string formatted datetime
        most_recent : bool

    API Response Keys
    -----------------
        status : string
        num_of_entries : int
        articles : dict
            canonical_url : string
            date_captured : string formatted datetime
            number_of_tweets : int
            site_type : {'claim', 'fact_checking'}
            title : string
            upper_day : string formatted datetime
    """
    lucene.getVMEnv().attachCurrentThread()
    yesterday = datetime.utcnow().date() - timedelta(days=1)
    yesterday = yesterday.strftime('%Y-%m-%d')
    q_top_article_schema = Schema({
        Optional('upper_day', default=yesterday):
        And(Regex('^\d{4}-\d{2}-\d{2}$'),
            Use(dateutil.parser.parse),
            error='Invalid date, should be yyyy-mm-dd format'),
        Optional('most_recent', default=True):
        And(unicode,
            Use(lambda s: s.lower()), lambda s: s in ('true', 'false'),
            Use(lambda s: True if s == 'true' else False)),
        Optional('exclude_tags', default=[]):
        And(Use(eval), error='Invalid exclude_tags input format'),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_top_article_schema.validate(q_kwargs)
        df = db_query_top_articles(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No top article found!')
        response = dict(
            status='OK',
            num_of_entries=len(df),
            articles=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
Example #18
    def multiFieldsSearch(self, query, sim):
        lucene.getVMEnv().attachCurrentThread()

        parser = MultiFieldQueryParser(
            ["content_section", "title_section", 'title_article'],
            self.analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(query, 6).scoreDocs
        return hits
Example #19
def hello(query=None):

    if query:
        lucene.getVMEnv().attachCurrentThread()
        parsed_query = transform(query)
        results = find_results(query, reader)
        return render_template('page.pug',
                               parsed_query=parsed_query,
                               results=results,
                               shown_fragments=3)

    return render_template('page.pug')
Example #20
 def startOaiPmh(self, portNumber, oaiJazz, storageComponent, register):
     getVMEnv().attachCurrentThread()
     with Reactor() as reactor:
         server = be(
             (Observable(),
                 (ObservableHttpServer(reactor, portNumber),
                     (OaiPmh(repositoryName='repositoryName',
                             adminEmail='adminEmail',
                             batchSize=2,
                             supportXWait=True),
                         (register,),
                         (oaiJazz,
                             (register,),
                         ),
                         (storageComponent,)
                     )
                 )
             )
         )
         list(compose(server.once.observer_init()))
         self._loopReactor(reactor)
Example #21
    def __call__(self, request):
        # Code to be executed for each request before
        # the view (and later middleware) are called.
        try:
            # get the VM context and attach it to this thread
            lucene.getVMEnv().attachCurrentThread()
        except:
            # no VM yet (getVMEnv() returned None), so create one
            lucene.initVM()

        response = self.get_response(request)

        # Code to be executed for each request/response after
        # the view is called.

        return response
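
A sketch of how a middleware like the one above would be wired into Django settings; the module path below is hypothetical:

# settings.py -- 'myapp.middleware.LuceneMiddleware' is a hypothetical path
MIDDLEWARE = [
    'django.middleware.common.CommonMiddleware',
    'myapp.middleware.LuceneMiddleware',  # attaches each request thread to the JVM
]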
Example #22
 def run(cls, args):
     try:
         # print(args)
         args = cls.args_schema.validate(args)
     except SchemaError as e:
         sys.exit(e)
     session = Session()
     # make sure lucene is initialized
     lucene.initVM()
     lucene.getVMEnv().attachCurrentThread()
     if args['--index'] is True:
         configure_logging(
             'lucene.index', console_level=args['--console-log-level'])
         mgid = get_or_create_m(
             session,
             MetaInfo,
             data=dict(
                 name='article_group_id_lucene_index',
                 value='0',
                 value_type='int',
                 description='article.group_id used for lucene index'),
             fb_uk='name')
         if args['--mode'] == 'create':
             mgid.set_value(0)
             session.commit()
         logger.debug('Indexing started.. Getting articles..')
         q = """
         SELECT DISTINCT ON (a.group_id) a.id, a.group_id,
             a.canonical_url,
             a.title, a.meta, a.content,
             coalesce(a.date_published, a.date_captured) AS pd,
             s.domain, s.site_type
         FROM article AS a
             JOIN site AS s ON s.id=a.site_id
         WHERE a.site_id IS NOT NULL AND s.is_enabled IS TRUE
             AND a.group_id>:gid
         ORDER BY group_id, pd ASC
         """
         articles_iter = session.execute(
             sqlalchemy.text(q).bindparams(gid=mgid.get_value()))
         cls.index(session, args['--mode'], articles_iter, mgid)
     elif args['--search'] is True:
         configure_logging(
             'lucene.search', console_level=args['--console-log-level'])
         cls.search(args['--query'], args['--top'])
     else:
         print("Unrecognized command!")
         sys.exit(2)
Example #23
    def run(self):

        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        # yeah, this should be refactored
        if "search" in self.action.keys():
            self.run_searcher(self.action['search'])
        if "delete" in self.action.keys():
            self.delete_index(self.action['delete'])
        if "export_tdm" in self.action.keys():
            self.export_TDM(self.action['export_tdm'])
        if "export_tdm_csv" in self.action.keys():
            self.export_TDM_csv(self.action['export_tdm_csv'])
        if "export_tdm_stm" in self.action.keys():
            self.export_TDM_stm(self.action['export_tdm_stm'])
        if "export_contents" in self.action.keys():
            self.export_contents(self.action['export_contents'])
        if "import_directory" in self.action.keys():
            self.import_directory(self.action['import_directory'])
        if "import_csv" in self.action.keys():
            self.import_csv(self.action['import_csv'])
        if "import_csv_with_content" in self.action.keys():
            self.import_csv_with_content(*self.action['import_csv_with_content'])
        if "rebuild_metadata_cache" in self.action.keys():
            self.rebuild_metadata_cache(*self.action['rebuild_metadata_cache'])
        if "reindex" in self.action.keys():
            self.reindex()
Example #24
def func(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # ------------ #
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    # ------------ #
    p = get_d_dimensional_vector(command)
    vp = get_vp(p)
    query = QueryParser(Version.LUCENE_CURRENT, "Vector", analyzer).parse(vp)
    scoreDocs = searcher.search(query, 200).scoreDocs

    dict1 = {}
    result = ""
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        rank = 0.6 * float(doc.get("Likes")) + 0.4 * float(doc.get("Views"))
        ch = doc.get('Page_num') + ' '
        ch += 'data/' + doc.get('Page_num') + '.jpg' + ' '
        ch += doc.get('Page_link') + ' '
        ch += doc.get('Views') + ' '
        ch += doc.get('Likes') + ' '
        tmp_alt = doc.get('Img_alt')
        tmp_alt = '_'.join(tmp_alt.split())
        ch += tmp_alt
        dict1[ch] = rank
    res_list = sorted(dict1.items(), key=lambda item: item[1], reverse=True)
    for i in res_list:
        result += i[0]
        result += ' '
    del searcher
    del analyzer
    return result
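Example #25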
def search():
	args = []
	if request.method == 'POST':
		vm_env = lucene.getVMEnv()
		if vm_env == None:
			lucene.initVM(vmargs=['-Djava.awt.headless=true'])
		if request.form['ies']:
			args.append('ies:'+request.form['ies'])
		if request.form['area']:
			args.append('area:'+request.form['area'])
		if request.form['professor']:
			args.append('professor:'+request.form['professor'])
		if request.form['uf']:
			args.append('uf:'+request.form['uf'])
		if request.form['conceito']:
			#args.append('m:'+request.form['conceito']+'d:'+request.form['conceito']+'f:'+request.form['conceito'])
			args.append('m:'+request.form['conceito'])
			args.append('d:'+request.form['conceito'])
	table = []
	if(len(args) > 0): 
		scoreDocs = mansearch.buscar('indexer/',args)
		fsDir = SimpleFSDirectory(File(indexDir))
		searcher = IndexSearcher(DirectoryReader.open(fsDir))
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			table.append(dict((field.name(), field.stringValue()) for field in doc.getFields()))
	return render_template('busca.html', table=table)
Example #26
def func2(command):
    STORE_DIR = "index1"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    res = []
    if command == '':
        return
    query = QueryParser(Version.LUCENE_CURRENT, "zhuliao",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 9).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        try:
            res.append([
                doc.get("name"),
                doc.get("collect_num"),
                doc.get("zhuliao").split(' '),
                doc.get("zuofa").split('\n'),
                doc.get("img_url"),
                doc.get("url")
            ])
        except:
            pass
    res1 = []
    for i in res:
        i[1] = int(i[1])
        res1.append(tuple(i))
    res2 = sorted(res1, cmp=None, key=lambda x: x[1], reverse=True)
    return res2
Example #27
    def getTermVectors(route):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)
        ls = []
        for doc in range(ireader.numDocs()):
            vector = FreqVector()
            vector.vector = []
            vector.freqs = []

            norm = 0.0
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                    t = Term("content", term)
                    idf = math.log(ireader.numDocs() / ireader.docFreq(t))
                    vector.vector.append(text)
                    vector.freqs.append(tf * idf)
                    norm += (tf * idf) * (tf * idf)
                ls.append((vector, math.sqrt(norm)))
            else:
                ls.append((vector, 0))
        return ls
Example #28
def run(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index1"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(analysis(command))
    HighlightFormatter = SimpleHTMLFormatter()
    highlighter = Highlighter(HighlightFormatter, QueryScorer(query))
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print 'path:', doc.get("path"), 'name:', doc.get(
            "name"), 'url:', doc.get("url"), 'title:', doc.get("title")
        text = doc.get('contents')
        highLightText = highlighter.getBestFragment(analyzer, "contents", text)
        if highLightText != None:
            highLightText = ''.join(highLightText.split(' '))
        data = {}
        data['url'] = doc.get("url")
        data['title'] = doc.get('title')
        data['highlight'] = highLightText
        result.append(data)
    return result
Example #29
def run_img(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index2"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    querys = BooleanQuery()
    query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent",
                                analyzer).parse(command)
    query_title = QueryParser(Version.LUCENE_CURRENT, "title",
                              analyzer).parse(command)
    querys.add(query_content, BooleanClause.Occur.SHOULD)
    querys.add(query_title, BooleanClause.Occur.SHOULD)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        print "WARNING: No result"
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print doc.get("title")
        data = {}
        data['title'] = doc.get('title')
        data['url'] = doc.get('url')
        data['imgurl'] = doc.get('imgurl')
        result.append(data)
    return result
Example #30
    def getMostFrequentTermNoStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq
        return currentTerm
Example #31
    def Qsearch(self,query):
        words = seg.segment(query.strip())
        #words = self.segmentor.segment(query.strip())
        #print ' '.join(words)
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        result = QueryParser(Version.LUCENE_CURRENT, "contents",self.analyzer)
        result.setPhraseSlop(0)
        # "\""+' '.join(words)+"\"~0" means words should be continuous
        query = result.parse("\""+' '.join(words)+"\"~0")
        totalHits = self.searcher.search(query, 50)
        #print "%s total matching documents." % totalHits.totalHits
        #return totalHits.totalHits

        for hit in totalHits.scoreDocs:
            #print"Hit Score: ",hit.score, "Hit Doc:",hit.doc, "HitString:",hit.toString()
            doc= self.searcher.doc(hit.doc)
            #print doc.get("name").encode("utf-8")
        #print "----------------------------------------"
        t = Term('contents',' '.join(words))
        #termDocs = ireader.termDocs(t)
        #for tt in termDocs:
        #       print ireader.document(termDocs.docs).getFeildable('neme'),termDocs.freq()
        #print self.reader.totalTermFreq(t)
        return self.reader.totalTermFreq(t)
Example #32
def func1(command):
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    if command == '':
        return []
    command_list = jieba.cut(command)
    command = " ".join(command_list)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doct = {
            'title': doc.get("title"),
            'url': doc.get("url"),
            "sentence": doc.get("sentence")
        }
        result.append(doct)
    del searcher
    return result
Example #33
def search(command):

    reordering = 'no'

    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    command = escape_lucene_special_chars(command)
    print("Searching for:", command)
    query = QueryParser("body", analyzer).parse(command)
    scoreDocs = searcher.search(query, 100).scoreDocs


    if reordering == 'ups':
        scoreDocs, scores = reorder_ups(scoreDocs, searcher)
    elif reordering == 'long':
        scoreDocs, scores = reorder_long(scoreDocs, searcher)
    elif reordering == 'normups':
        scoreDocs, scores = reorder_normups(scoreDocs, searcher)
    else:
        n_docs = len(scoreDocs)
        scores = {sd.doc: (n_docs-i,) for i,sd in enumerate(scoreDocs)}

    scoreDocs = scoreDocs[:5]

    for sd in scoreDocs:
        print(sd.doc,'\t',scores[sd.doc])

    docs = [searcher.doc(sd.doc) for sd in scoreDocs]
    return [(d.get('name'), d.get('body')) for d in docs]
Example #34
 def __enter__(self):
     vm_env = lucene.getVMEnv()
     vm_env.attachCurrentThread()
     self.reader = reader        # module-level globals assumed to exist at import time
     self.searcher = searcher
     self.analyzer = analyzer
     return self
Example #35
def run_music(ID):
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "id", analyzer).parse(ID)
    scoreDocs = searcher.search(query, 1).scoreDocs

    try:
        scoreDoc = scoreDocs[0]
    except:
        return None
    doc = searcher.doc(scoreDoc.doc)

    item = []
    item.append(doc.get("song_title").encode('utf-8'))
    item.append(doc.get('song_url'))
    item.append(doc.get("singer").encode('utf-8'))
    item.append(doc.get("album").encode('utf-8'))
    item.append(doc.get("album_pic"))
    item.append(doc.get("album_genre").encode('utf-8'))
    item.append(doc.get("lyrics").encode('utf-8'))

    sim_str = doc.get("similar").encode('utf-8')
    sim_list = sim_str.split('+')
    for i in range(3):
        sim_list[i] = sim_list[i].split('*')
    item.append(sim_list)

    del searcher

    return item
Example #36
    def getMostFrequentTermStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = [
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "more", "http", "html", "of", "on", "or",
            "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with", "el", "la", "lo", "los",
            "las", "ante", "con", "sin", "que", "es", "de", "en", "por", "y",
            "los"
        ]

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString().encode('UTF-8')
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq

        return currentTerm
Example #37
    def createIndexNoStopwords(texts, route, rebuild):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)

        for key in texts:
            doc = Document()
            doc.add(
                Field("docName", "doc", Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)

        iwriter.close()
Example #38
    def multiFieldsSearch(self, query, sim):
        """
        Method that searches through documents using content_section and title_article Fields
        searchDir : the path to the folder that contains the index.
        """
        # Now search the index:
        lucene.getVMEnv().attachCurrentThread()

        parser = MultiFieldQueryParser(["content_section", "title_article"],
                                       self.analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(query, 6).scoreDocs
        return hits
Example #39
    def createIndexStopwords(texts, route, rebuild):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        stopWords = [
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "of", "on", "or", "such", "that", "the",
            "their", "then", "there", "these", "they", "this", "to", "was",
            "will", "with", "el", "la", "lo", "los", "las", "ante", "con",
            "sin", "que", "es", "de", "en", "por", "y", "los"
        ]
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)

        for key in texts:
            doc = Document()
            doc.add(
                Field("docName", key.__str__(), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)

        iwriter.close()
Example #41
    def getMostFrequentTermNoStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq
        return currentTerm
Example #42
def run(command):
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 10).scoreDocs
    #print "%s total matching documents." % len(scoreDocs)
    res = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        tmp = []
        tmp.append([doc.get('name1'), doc.get('name2')])
        tmp.append(doc.get("homepage"))
        tmp.append(doc.get("intro"))
        tmp.append(doc.get('logo'))
        a = doc.get('goods')
        a = a.split('\n')
        for i in a:
            tmp.append(i)
        res.append(tmp)

    return command, res
Example #43
def init(vmargs='-Xrs,-Djava.awt.headless=true', **kwargs):
    """Callback to initialize VM and app roots after daemonizing."""
    assert lucene.getVMEnv() or lucene.initVM(vmargs=vmargs, **kwargs)
    for app in cherrypy.tree.apps.values():
        if isinstance(app.root, WebSearcher):
            app.root.__init__(*app.root.__dict__.pop('args'),
                              **app.root.__dict__.pop('kwargs'))
Example #44
def get_image_pmcid(pmcid, classes = ""):
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    
    #query = query.parse(query, ('4175339','1'))
    # query.parse(queryString)#"Shigella sonnei"
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    #hits = searcher.search(query, MAX)
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) +")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query
    q = query.parse(queryStr) 
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))
    return docs #This will return the image documents that belong to a pmcid(article)
Example #45
def getRandomDoc2():
    
        location = web.__path__[0] + "/static/web/files/index/index.articles"
        #lucene.initVM()
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        reader = IndexReader.open(SimpleFSDirectory(File(location)))
        searcher = IndexSearcher(reader)
     
        #query = QueryParser(Version.LUCENE_4_10_1, "keywords", analyzer).parse(queryString)#"Shigella sonnei"
        MAX = 1000
        docNum = randrange(0, reader.maxDoc())
        doc = reader.document(docNum)
     
        #print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        files = []
        fileRoots = []
        paths = []
        paths.append(doc.get("articlepath"))
        pth = paths[0].replace("/home/kevin/Downloads/","/home/kevin/git/YIF/imageFinder/web/static/web/")#os.path.join(tools.__path__,"static/web/images")
        for root, directories, filenames in os.walk(pth):#probably something wrong with the location
            for filename in filenames:
                # the original `(".jpg" or ".gif" or ".png") in filename` only
                # ever tested ".jpg"; check all three extensions
                if any(ext in filename for ext in (".jpg", ".gif", ".png")):
                    files.append(root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/","") + "/" + filename)#temp, will need to change
                    fileRoots.append(root)
                    print (root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/","") + "/" + filename)
        try:
            rng = randrange(0, len(files))  # raises ValueError when files is empty
        except:
            return -1
        else:
            return files[rng]
Example #46
    def getTermVectors(route):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)
        ls = []
        for doc in range(ireader.numDocs()):
            vector = FreqVector()
            vector.vector = []
            vector.freqs = []

            norm = 0.0
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                    t = Term("content", term)
                    idf = math.log(ireader.numDocs() / ireader.docFreq(t))
                    vector.vector.append(text)
                    vector.freqs.append(tf * idf)
                    norm += (tf * idf) * (tf * idf)
                ls.append((vector, math.sqrt(norm)))
            else:
                ls.append((vector, 0))
        return ls
Example #47
    def run(self):

        from lucene import getVMEnv
        self._vmEnv = env = getVMEnv()
        if env is not None:
            env.attachCurrentThread()

        super(RepositoryThread, self).run()
Example #48
def SearchQuery(queryString, fields, classification): 
    #if __name__ == "__main__":
    #if __name__ == "retriever":
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    #query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(query, queryString)
    #query.parse(queryString)#"Shigella sonnei"
    #query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    hits = searcher.search(query, MAX)
 
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title" : doc.get("title")}#we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict 
    
    #Where we get the images for all the pmcids    
    images = get_image_pmcid(pmcids, classification)#should take in pmcids and class
    #create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid") 
        if img_pmcid in imagesDict.keys():
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
            
        else:
            imagesDict[img_pmcid] = [(img.get("filepath") + "/" + img.get("figureid"))]
            
    #for each pmcid, we will assign an image to it for the search results
    for pmcid in pmcids:
        if pmcid in imagesDict:   # only index pmcids that actually have images
            docDict = documentDict[pmcid]
            docDict["imgURL"] = imagesDict[pmcid][0]
            documentDict[pmcid] = docDict
        else:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
            documentDict[pmcid] = docDict
    
    #END - Where we get the images for all the pmcids
    
    
    return documentDict
Example #49
 def startOaiPmh(self, portNumber, oaiJazz, storageComponent, register):
     getVMEnv().attachCurrentThread()
     reactor = Reactor()
     server = be(
         (Observable(),
             (ObservableHttpServer(reactor, portNumber),
                 (OaiPmh(repositoryName='repositoryName', adminEmail='adminEmail', batchSize=2, supportXWait=True),
                     (register,),
                     (oaiJazz,
                         (register,),
                     ),
                     (storageComponent,)
                 )
             )
         )
     )
     list(compose(server.once.observer_init()))
     self._loopReactor(reactor)
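Example #50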
 def __init__(self):
     """
     Inits a Reader by attaching the current luceneVM to the thread and creating a store and an IndexReader instance.
     """
     vm_env = lucene.getVMEnv()  # get lucene.vm
     vm_env.attachCurrentThread()
     self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
     self.store = lucene.SimpleFSDirectory(lucene.File(DIR))
     self.reader = lucene.IndexReader.open(self.store, True)
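Example #51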
 def __init__(self):
     """
     Inits a Writer by attaching the current luceneVM to the thread and creating an analyzer, a store and an IndexWriter instance.
     """
     vm_env = lucene.getVMEnv()  # get lucene.vm
     vm_env.attachCurrentThread()
     self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
     self.store = lucene.SimpleFSDirectory(lucene.File(DIR))
     self.writer = lucene.IndexWriter(self.store, self.analyzer, True, lucene.IndexWriter.MaxFieldLength(512))
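Example #52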
 def __init__(self):
     """
     Inits a Searcher by attaching the current luceneVM to the thread and creating an analyzer, a store, a parser and an IndexSearcher instance.
     """
     vm_env = lucene.getVMEnv()
     vm_env.attachCurrentThread()
     self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
     self.store = lucene.SimpleFSDirectory(lucene.File(DIR))
     self.parser = lucene.MultiFieldQueryParser(lucene.Version.LUCENE_CURRENT, ["content"], self.analyzer)
     self.searcher = lucene.IndexSearcher(self.store, readOnly=True)
Example #53
def _start_indexer(config, db_name):
    jcc_env = lucene.getVMEnv()
    jcc_env.attachCurrentThread()

    while True:
        restart = _run_indexer(config, db_name)
        if not restart:
            print "Exit db indexer %s" % db_name
            break
        print "Restarted db indexer %s" % db_name
Example #54
def doSearch(queryS, field="id", defaultField="data"):
    lucene.getVMEnv().attachCurrentThread()
    store = getStore()
    searcher = getSearcher(store)
    analyzer = getAnalyzer()
    parser = lucene.QueryParser(defaultField, analyzer)
    query = parser.parse(queryS)
    query = query.rewrite(getReader(store))
    hits = searcher.search(query)
    
    results = []
    
    for i in range(0, hits.length()):
        results.append(hits.doc(i).get(field))
    
    searcher.close()
    store.close()
            
    return results
Example #55
 def search(self, q=None):
     lucene.getVMEnv().attachCurrentThread()
     if q is None or not q.strip():
         search = False
         query = ''
         query_raw = ''
         hits = 0
         places = []
     else:
         search = True
         query_raw = q.replace('"', '')
         query = utils.escape_html(q)
         hits, places = self.storage.search(q, ontology=self.ontology)
     return tmpl_lookup.get_template('search.mako').render_unicode(
         search=search,
         query=query,
         query_raw=query_raw,
         hits=hits,
         places=places)
Example #56
    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
Example #57
def doSearch(queryS, field="id"):
	lucene.getVMEnv().attachCurrentThread()
	lucene.BooleanQuery.setMaxClauseCount(sys.maxint)
	store = getStore()
	searcher = getSearcher(store)
	analyzer = getAnalyzer()
	parser = lucene.QueryParser("ssid", analyzer)
	query = parser.parse(queryS)
	query = query.rewrite(getReader(store))
	hits = searcher.search(query)
	
	results = []
	
	for i in range(0, hits.length()):
		results.append(hits.doc(i).get(field))
	
	searcher.close()
	store.close()
			
	return results
Example #58
 def __init__(self, rows=None):
     #lucene.initVM()
     # configure lucene.initVM() once in Django's settings.py and reuse it here
     vm_env = lucene.getVMEnv()
     if vm_env == None:
         lucene.initVM()
     else:
         vm_env.attachCurrentThread()
     self.analyzer = lucene.StandardAnalyzer(Version.LUCENE_30)
     self.indexDir = SimpleFSDirectory(File(INDEX_DIRECTORY))
     self.rows = rows
Example #59
def query(s):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    print (s)
    a = WebIndexer()
    result = []
    try:
        res = a.query(u'name:"'+' '.join(jieba.cut(s, cut_all=False))+'" ', 'name')
    except Exception, e:
        print (e)
        print unicode(e.getJavaException())
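Example #60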
def _importVM():
    maxheap = getenv('PYLUCENE_MAXHEAP')
    if not maxheap:
        maxheap = '4g'
        warn("Using '4g' as maxheap for lucene.initVM(). To override use PYLUCENE_MAXHEAP environment variable.")
    from lucene import initVM, getVMEnv
    try:
        VM = initVM(maxheap=maxheap)
        # VM = initVM(maxheap=maxheap, vmargs='-agentlib:hprof=heap=sites')
    except ValueError:
        VM = getVMEnv()
    return VM
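
_importVM reads PYLUCENE_MAXHEAP before starting the JVM, so the variable must be set before the first call; if a VM already exists, initVM() raises ValueError and the running env is returned instead. A small usage sketch (the '8g' value is illustrative):

import os
os.environ['PYLUCENE_MAXHEAP'] = '8g'  # illustrative; set before _importVM() runs

VM = _importVM()  # starts the JVM on first use, reuses the running VM afterwards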